1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
27 */
28
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/vdev_impl.h>
32 #ifndef __FreeBSD__
33 #include <sys/vdev_disk.h>
34 #endif
35 #include <sys/vdev_file.h>
36 #include <sys/vdev_raidz.h>
37 #include <sys/zio.h>
38 #include <sys/zio_checksum.h>
39 #include <sys/fs/zfs.h>
40 #include <sys/fm/fs/zfs.h>
41 #ifdef __FreeBSD__
42 #include <sys/bio.h>
43 #endif
44
45 /*
46 * Virtual device vector for RAID-Z.
47 *
48 * This vdev supports single, double, and triple parity. For single parity,
49 * we use a simple XOR of all the data columns. For double or triple parity,
50 * we use a special case of Reed-Solomon coding. This extends the
51 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
52 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
53 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
54 * former is also based. The latter is designed to provide higher performance
55 * for writes.
56 *
57 * Note that the Plank paper claimed to support arbitrary N+M, but was then
58 * amended six years later identifying a critical flaw that invalidates its
59 * claims. Nevertheless, the technique can be adapted to work for up to
60 * triple parity. For additional parity, the amendment "Note: Correction to
61 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
62 * is viable, but the additional complexity means that write performance will
63 * suffer.
64 *
65 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
67 * can be expressed with a single byte. Briefly, the operations on the
68 * field are defined as follows:
69 *
70 * o addition (+) is represented by a bitwise XOR
71 * o subtraction (-) is therefore identical to addition: A + B = A - B
72 * o multiplication of A by 2 is defined by the following bitwise expression:
73 *
74 * (A * 2)_7 = A_6
75 * (A * 2)_6 = A_5
76 * (A * 2)_5 = A_4
77 * (A * 2)_4 = A_3 + A_7
78 * (A * 2)_3 = A_2 + A_7
79 * (A * 2)_2 = A_1 + A_7
80 * (A * 2)_1 = A_0
81 * (A * 2)_0 = A_7
82 *
83 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
84 * As an aside, this multiplication is derived from the error correcting
85 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
86 *
87 * Observe that any number in the field (except for 0) can be expressed as a
88 * power of 2 -- a generator for the field. We store a table of the powers of
89 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
90 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
91 * than field addition). The inverse of a field element A (A^-1) is therefore
92 * A ^ (255 - 1) = A^254.
93 *
94 * The up-to-three parity columns, P, Q, R over several data columns,
95 * D_0, ... D_n-1, can be expressed by field operations:
96 *
97 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
98 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
99 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
100 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
101 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
102 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
104 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
105 * independent coefficients. (There are no additional coefficients that have
106 * this property which is why the uncorrected Plank method breaks down.)
107 *
 * See the reconstruction code below for how P, Q and R can be used individually
109 * or in concert to recover missing data columns.
110 */
111
/*
 * Per-column state for one logical RAID-Z I/O: one entry per child vdev
 * touched by the I/O (parity columns first, then data columns).
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
122
/*
 * Geometry and state for one logical I/O spread across the RAID-Z group.
 * Lifetime: rm_freed and rm_reports coordinate freeing between the owning
 * zio (vdev_raidz_map_free_vsd) and any outstanding checksum reports
 * (vdev_raidz_cksum_free); whichever drops the last reference frees the map.
 *
 * NOTE(review): rm_col[1] is the pre-C99 trailing-array idiom; allocation
 * and free sizes are computed via offsetof(raidz_map_t, rm_col[n]), so a
 * C99 flexible array member would also work — confirm no sizeof() users
 * elsewhere in the file before changing.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
139
/* Indices of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a byte by 2 (and by 4) in GF(2^8) with the primitive polynomial
 * x^8 + x^4 + x^3 + x^2 + 1 (0x11d).  The result is not masked to 8 bits
 * here; callers assign it to a uint8_t.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * Wrapped in do/while (0) so the macros act as a single statement and
 * compose safely with unbraced if/else.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)

/* Byte offset of x within the vdev, past the front label region. */
#define	VDEV_LABEL_OFFSET(x)	((x) + VDEV_LABEL_START_SIZE)
168
/*
 * Force reconstruction to use the general purpose method.
 * NOTE(review): presumably a debug/testing tunable consulted by the
 * reconstruction dispatch — verify against the combinatorial
 * reconstruction code later in this file.
 */
int vdev_raidz_default_to_general;
173
/*
 * Powers of 2 in the Galois field defined above: vdev_raidz_pow2[i] == 2^i
 * in GF(2^8).  Note pow2[255] == pow2[0] == 1, since the multiplicative
 * group has order 255.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs base 2 in the Galois field defined above: vdev_raidz_log2[a] is the
 * exponent e such that 2^e == a.  Entry 0 is unused (log of zero is
 * undefined) and is set to 0; vdev_raidz_exp2() special-cases a == 0.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
244
245 static void vdev_raidz_generate_parity(raidz_map_t *rm);
246
247 /*
248 * Multiply a given number by 2 raised to the given power.
249 */
250 static uint8_t
vdev_raidz_exp2(uint_t a,int exp)251 vdev_raidz_exp2(uint_t a, int exp)
252 {
253 if (a == 0)
254 return (0);
255
256 ASSERT(exp >= 0);
257 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
258
259 exp += vdev_raidz_log2[a];
260 if (exp > 255)
261 exp -= 255;
262
263 return (vdev_raidz_pow2[exp]);
264 }
265
266 static void
vdev_raidz_map_free(raidz_map_t * rm)267 vdev_raidz_map_free(raidz_map_t *rm)
268 {
269 int c;
270 size_t size;
271
272 for (c = 0; c < rm->rm_firstdatacol; c++) {
273 if (rm->rm_col[c].rc_data != NULL)
274 zio_buf_free(rm->rm_col[c].rc_data,
275 rm->rm_col[c].rc_size);
276
277 if (rm->rm_col[c].rc_gdata != NULL)
278 zio_buf_free(rm->rm_col[c].rc_gdata,
279 rm->rm_col[c].rc_size);
280 }
281
282 size = 0;
283 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
284 size += rm->rm_col[c].rc_size;
285
286 if (rm->rm_datacopy != NULL)
287 zio_buf_free(rm->rm_datacopy, size);
288
289 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
290 }
291
292 static void
vdev_raidz_map_free_vsd(zio_t * zio)293 vdev_raidz_map_free_vsd(zio_t *zio)
294 {
295 raidz_map_t *rm = zio->io_vsd;
296
297 ASSERT0(rm->rm_freed);
298 rm->rm_freed = 1;
299
300 if (rm->rm_reports == 0)
301 vdev_raidz_map_free(rm);
302 }
303
304 /*ARGSUSED*/
305 static void
vdev_raidz_cksum_free(void * arg,size_t ignored)306 vdev_raidz_cksum_free(void *arg, size_t ignored)
307 {
308 raidz_map_t *rm = arg;
309
310 ASSERT3U(rm->rm_reports, >, 0);
311
312 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
313 vdev_raidz_map_free(rm);
314 }
315
/*
 * Checksum report finish callback.  Given the known-good data for the whole
 * logical block (good_data), produce the expected contents of column
 * zcr->zcr_cbinfo — regenerating parity if it is a parity column — and hand
 * the expected ("good") and actual ("bad") column buffers to the ereport
 * code for comparison.
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;	/* column index this report covers */
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	/* No good data available: finish the report without a comparison. */
	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			/*
			 * Data columns must be pointed back at rm_datacopy
			 * (the aside copy made in vdev_raidz_cksum_report()),
			 * not left pointing into good_data.
			 */
			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}
388
/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely. The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 *
 * 'arg' carries the column index the report covers (stashed as a size_t
 * in the pointer).  Each call takes a reference on the map via rm_reports.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	/* Only the first report for this map needs to copy the data aside. */
	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	/* Copy each data column and repoint the column at its copy. */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}
441
/*
 * Callbacks handed to the zio layer: how to free the raidz map attached as
 * vdev-specific data, and how to capture column state for a checksum report.
 */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
446
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * data       - I/O buffer; data columns alias into it unless dofree is set
 * size       - logical I/O size in bytes
 * offset     - byte offset of the I/O within the RAID-Z vdev
 * dofree     - when true, no column buffers are allocated or assigned
 *              (rc_data stays NULL for every column)
 * unit_shift - log2 of the device's minimum allocation unit
 * dcols      - total number of columns (children) in the vdev
 * nparity    - number of parity columns (1-3)
 */
static raidz_map_t *
vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/*
	 * Lay out each column: stripes wrap around the child vdevs starting
	 * at column f; columns that wrap advance one unit on their device.
	 */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/*
	 * Allocate parity buffers and carve the data columns out of the
	 * caller's buffer; skipped when freeing (no I/O will be issued).
	 */
	if (!dofree) {
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			rm->rm_col[c].rc_data =
			    zio_buf_alloc(rm->rm_col[c].rc_size);
		}

		rm->rm_col[c].rc_data = data;

		for (c = c + 1; c < acols; c++) {
			rm->rm_col[c].rc_data =
			    (char *)rm->rm_col[c - 1].rc_data +
			    rm->rm_col[c - 1].rc_size;
		}
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
599
600 static void
vdev_raidz_generate_parity_p(raidz_map_t * rm)601 vdev_raidz_generate_parity_p(raidz_map_t *rm)
602 {
603 uint64_t *p, *src, pcount, ccount, i;
604 int c;
605
606 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
607
608 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
609 src = rm->rm_col[c].rc_data;
610 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
611 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
612
613 if (c == rm->rm_firstdatacol) {
614 ASSERT(ccount == pcount);
615 for (i = 0; i < ccount; i++, src++, p++) {
616 *p = *src;
617 }
618 } else {
619 ASSERT(ccount <= pcount);
620 for (i = 0; i < ccount; i++, src++, p++) {
621 *p ^= *src;
622 }
623 }
624 }
625 }
626
/*
 * Compute P (plain XOR) and Q (Reed-Solomon with generator 2) parity.
 * Q uses Horner's scheme: each step multiplies the accumulator by 2 in
 * GF(2^8) (64 bits at a time) and XORs in the next data column.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* Seed both parity buffers from the first column. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			/* Zero any tail past the (possibly empty) column. */
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
678
/*
 * Compute P (XOR), Q (generator 2), and R (generator 4) parity.  Q and R
 * use Horner's scheme: each step multiplies the running parity by its
 * generator in GF(2^8) (64 bits at a time) and XORs in the next column.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* Seed all three parity buffers from the first column. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			/* Zero any tail past the (possibly empty) column. */
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
739
740 /*
741 * Generate RAID parity in the first virtual columns according to the number of
742 * parity columns available.
743 */
744 static void
vdev_raidz_generate_parity(raidz_map_t * rm)745 vdev_raidz_generate_parity(raidz_map_t *rm)
746 {
747 switch (rm->rm_firstdatacol) {
748 case 1:
749 vdev_raidz_generate_parity_p(rm);
750 break;
751 case 2:
752 vdev_raidz_generate_parity_pq(rm);
753 break;
754 case 3:
755 vdev_raidz_generate_parity_pqr(rm);
756 break;
757 default:
758 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
759 }
760 }
761
762 static int
vdev_raidz_reconstruct_p(raidz_map_t * rm,int * tgts,int ntgts)763 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
764 {
765 uint64_t *dst, *src, xcount, ccount, count, i;
766 int x = tgts[0];
767 int c;
768
769 ASSERT(ntgts == 1);
770 ASSERT(x >= rm->rm_firstdatacol);
771 ASSERT(x < rm->rm_cols);
772
773 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
774 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
775 ASSERT(xcount > 0);
776
777 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
778 dst = rm->rm_col[x].rc_data;
779 for (i = 0; i < xcount; i++, dst++, src++) {
780 *dst = *src;
781 }
782
783 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
784 src = rm->rm_col[c].rc_data;
785 dst = rm->rm_col[x].rc_data;
786
787 if (c == x)
788 continue;
789
790 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
791 count = MIN(ccount, xcount);
792
793 for (i = 0; i < count; i++, dst++, src++) {
794 *dst ^= *src;
795 }
796 }
797
798 return (1 << VDEV_RAIDZ_P);
799 }
800
/*
 * Reconstruct a single missing data column x using the Q parity.  We
 * recompute the partial Q over the surviving columns (treating column x as
 * zero), fold in the stored Q, and then strip x's coefficient 2^(ndevs-1-x)
 * by multiplying by its inverse, 2^(255 - (ndevs-1-x)), one byte at a time.
 * Returns a bitmask naming the parity used (Q).
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		/* The missing column contributes zeros to the partial Q. */
		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			/* Horner step: multiply running Q by 2, add column. */
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	/* Add stored Q, then multiply each byte by the inverse coefficient. */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}
858
/*
 * Reconstruct two missing data columns x and y (x < y) using both P and Q
 * parity.  We regenerate P and Q as though x and y were zero-filled (Pxy,
 * Qxy), then solve the resulting two-equation GF(2^8) system for D_x and
 * D_y.  Returns a bitmask naming the parities used (P and Q).
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	/* Restore the true column sizes before solving. */
	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/* Coefficients as log-domain exponents; tmp is log of (a ^ 1)^-1. */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		/* y may be a shorter column than x. */
		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
949
950 /* BEGIN CSTYLED */
951 /*
952 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
954 * the contents of the data and parity disks. This can be expressed with
955 * vectors for the original data (D) and the actual data (d) and parity (p)
956 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
957 *
958 * __ __ __ __
959 * | | __ __ | p_0 |
960 * | V | | D_0 | | p_m-1 |
961 * | | x | : | = | d_0 |
962 * | I | | D_n-1 | | : |
963 * | | ~~ ~~ | d_n-1 |
964 * ~~ ~~ ~~ ~~
965 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
968 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
969 * computation as well as linear separability.
970 *
971 * __ __ __ __
972 * | 1 .. 1 1 1 | | p_0 |
973 * | 2^n-1 .. 4 2 1 | __ __ | : |
974 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
975 * | 1 .. 0 0 0 | | D_1 | | d_0 |
976 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
977 * | : : : : | | : | | d_2 |
978 * | 0 .. 1 0 0 | | D_n-1 | | : |
979 * | 0 .. 0 1 0 | ~~ ~~ | : |
980 * | 0 .. 0 0 1 | | d_n-1 |
981 * ~~ ~~ ~~ ~~
982 *
983 * Note that I, V, d, and p are known. To compute D, we must invert the
984 * matrix and use the known data and parity values to reconstruct the unknown
985 * data values. We begin by removing the rows in V|I and d|p that correspond
986 * to failed or missing columns; we then make V|I square (n x n) and d|p
987 * sized n by removing rows corresponding to unused parity from the bottom up
988 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
989 * using Gauss-Jordan elimination. In the example below we use m=3 parity
990 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
991 * __ __
992 * | 1 1 1 1 1 1 1 1 |
993 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
994 * | 19 205 116 29 64 16 4 1 | / /
995 * | 1 0 0 0 0 0 0 0 | / /
996 * | 0 1 0 0 0 0 0 0 | <--' /
997 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
998 * | 0 0 0 1 0 0 0 0 |
999 * | 0 0 0 0 1 0 0 0 |
1000 * | 0 0 0 0 0 1 0 0 |
1001 * | 0 0 0 0 0 0 1 0 |
1002 * | 0 0 0 0 0 0 0 1 |
1003 * ~~ ~~
1004 * __ __
1005 * | 1 1 1 1 1 1 1 1 |
1006 * | 19 205 116 29 64 16 4 1 |
1007 * | 1 0 0 0 0 0 0 0 |
1008 * (V|I)' = | 0 0 0 1 0 0 0 0 |
1009 * | 0 0 0 0 1 0 0 0 |
1010 * | 0 0 0 0 0 1 0 0 |
1011 * | 0 0 0 0 0 0 1 0 |
1012 * | 0 0 0 0 0 0 0 1 |
1013 * ~~ ~~
1014 *
1015 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1016 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1017 * matrix is not singular.
1018 * __ __
1019 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1020 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1021 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1022 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1023 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1024 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1025 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1026 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1027 * ~~ ~~
1028 * __ __
1029 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1030 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1031 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1032 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1033 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1034 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1035 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1036 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1037 * ~~ ~~
1038 * __ __
1039 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1040 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1041 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1042 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1043 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1044 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1045 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1046 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1047 * ~~ ~~
1048 * __ __
1049 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1050 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1051 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1052 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1053 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1054 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1055 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1056 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1057 * ~~ ~~
1058 * __ __
1059 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1060 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1061 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1062 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1063 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1064 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1065 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1066 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1067 * ~~ ~~
1068 * __ __
1069 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1070 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1071 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1072 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1073 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1074 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1075 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1076 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1077 * ~~ ~~
1078 * __ __
1079 * | 0 0 1 0 0 0 0 0 |
1080 * | 167 100 5 41 159 169 217 208 |
1081 * | 166 100 4 40 158 168 216 209 |
1082 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1083 * | 0 0 0 0 1 0 0 0 |
1084 * | 0 0 0 0 0 1 0 0 |
1085 * | 0 0 0 0 0 0 1 0 |
1086 * | 0 0 0 0 0 0 0 1 |
1087 * ~~ ~~
1088 *
1089 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1090 * of the missing data.
1091 *
1092 * As is apparent from the example above, the only non-trivial rows in the
1093 * inverse matrix correspond to the data disks that we're trying to
1094 * reconstruct. Indeed, those are the only rows we need as the others would
1095 * only be useful for reconstructing data known or assumed to be valid. For
1096 * that reason, we only build the coefficients in the rows that correspond to
1097 * targeted columns.
1098 */
1099 /* END CSTYLED */
1100
1101 static void
vdev_raidz_matrix_init(raidz_map_t * rm,int n,int nmap,int * map,uint8_t ** rows)1102 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1103 uint8_t **rows)
1104 {
1105 int i, j;
1106 int pow;
1107
1108 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1109
1110 /*
1111 * Fill in the missing rows of interest.
1112 */
1113 for (i = 0; i < nmap; i++) {
1114 ASSERT3S(0, <=, map[i]);
1115 ASSERT3S(map[i], <=, 2);
1116
1117 pow = map[i] * n;
1118 if (pow > 255)
1119 pow -= 255;
1120 ASSERT(pow <= 255);
1121
1122 for (j = 0; j < n; j++) {
1123 pow -= map[i];
1124 if (pow < 0)
1125 pow += 255;
1126 rows[i][j] = vdev_raidz_pow2[pow];
1127 }
1128 }
1129 }
1130
/*
 * Invert the rows of interest of the reconstruction matrix in place using
 * Gauss-Jordan elimination over GF(2^8).  Only the nmissing rows whose
 * pivots are the missing data columns (missing[]) are computed; invrows[]
 * receives the corresponding rows of the inverse (see the block comment
 * above for why the other rows are unnecessary).
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.  In GF(2^8) the
		 * multiplicative inverse of a is a^(255 - log2(a)).
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			/*
			 * Addition and subtraction coincide (XOR) in
			 * GF(2^8), so eliminating the pivot column from the
			 * other rows is a plain XOR of the scaled row.
			 */
			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1224
/*
 * Multiply the precomputed inverse rows (invrows) by the surviving columns
 * (used[]) to regenerate the contents of the missing data columns.  The
 * GF(2^8) multiplications are done in the log domain: invlog caches the
 * logs of the inverse coefficients so each byte costs one table lookup
 * and a modular add.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/* One scratch row of logs per missing column. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/* Accumulate each surviving column's contribution in turn. */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Columns may be shorter than the source. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* Multiply: add logs modulo 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first column initializes the
				 * destination; later columns XOR into it.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1300
/*
 * Reconstruct an arbitrary combination of missing data and parity columns
 * using the matrix method described in the block comment above.  Returns
 * a bitmask of the parity columns that were used to perform the
 * reconstruction.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	/* n is the number of data columns. */
	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/* Single allocation carved into rows, invrows, and used. */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * Append the non-missing data columns; note that i deliberately
	 * continues from the parity entries filled in just above.
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1406
/*
 * Reconstruct the columns listed in t[] (sorted ascending, nt entries)
 * plus any additional columns that recorded I/O errors.  Dispatches to
 * the optimized P/Q/PQ routines when the required parity is intact,
 * otherwise falls back to general matrix reconstruction.  Returns a
 * nonzero bitmask of the parity columns used.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY] = {0};

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	/*
	 * Merge the caller's targets with any columns that recorded I/O
	 * errors, tracking which parity columns remain usable.
	 */
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/* Data targets follow the parity targets in the sorted tgts[]. */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1485
1486 static int
vdev_raidz_open(vdev_t * vd,uint64_t * asize,uint64_t * max_asize,uint64_t * logical_ashift,uint64_t * physical_ashift)1487 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1488 uint64_t *logical_ashift, uint64_t *physical_ashift)
1489 {
1490 vdev_t *cvd;
1491 uint64_t nparity = vd->vdev_nparity;
1492 int c;
1493 int lasterror = 0;
1494 int numerrors = 0;
1495
1496 ASSERT(nparity > 0);
1497
1498 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1499 vd->vdev_children < nparity + 1) {
1500 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1501 return (SET_ERROR(EINVAL));
1502 }
1503
1504 vdev_open_children(vd);
1505
1506 for (c = 0; c < vd->vdev_children; c++) {
1507 cvd = vd->vdev_child[c];
1508
1509 if (cvd->vdev_open_error != 0) {
1510 lasterror = cvd->vdev_open_error;
1511 numerrors++;
1512 continue;
1513 }
1514
1515 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1516 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1517 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1518 *physical_ashift = MAX(*physical_ashift,
1519 cvd->vdev_physical_ashift);
1520 }
1521
1522 *asize *= vd->vdev_children;
1523 *max_asize *= vd->vdev_children;
1524
1525 if (numerrors > nparity) {
1526 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1527 return (lasterror);
1528 }
1529
1530 return (0);
1531 }
1532
1533 static void
vdev_raidz_close(vdev_t * vd)1534 vdev_raidz_close(vdev_t *vd)
1535 {
1536 int c;
1537
1538 for (c = 0; c < vd->vdev_children; c++)
1539 vdev_close(vd->vdev_child[c]);
1540 }
1541
1542 #ifdef illumos
1543 /*
1544 * Handle a read or write I/O to a RAID-Z dump device.
1545 *
1546 * The dump device is in a unique situation compared to other ZFS datasets:
1547 * writing to this device should be as simple and fast as possible. In
1548 * addition, durability matters much less since the dump will be extracted
1549 * once the machine reboots. For that reason, this function eschews parity for
1550 * performance and simplicity. The dump device uses the checksum setting
1551 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1552 * dataset.
1553 *
1554 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
1555 * 128 KB will not fill an entire block; in addition, they may not be properly
1556 * aligned. In that case, this function uses the preallocated 128 KB block and
1557 * omits reading or writing any "empty" portions of that block, as opposed to
1558 * allocating a fresh appropriately-sized block.
1559 *
1560 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1561 *
1562 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1563 *
1564 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1565 * allocated which spans all five child vdevs. 8 KB of data would be written to
1566 * each of four vdevs, with the fifth containing the parity bits.
1567 *
1568 * parity data data data data
1569 * | PP | XX | XX | XX | XX |
1570 * ^ ^ ^ ^ ^
1571 * | | | | |
1572 * 8 KB parity ------8 KB data blocks------
1573 *
1574 * However, when writing to the dump device, the behavior is different:
1575 *
1576 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1577 *
1578 * Unlike the normal RAID-Z case in which the block is allocated based on the
1579 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
1580 * I/O size is less than 128 KB, only the actual portions of data are written.
1581 * In this example the data is written to the third data vdev since that vdev
1582 * contains the offset [64 KB, 96 KB).
1583 *
1584 * parity data data data data
1585 * | | | | XX | |
1586 * ^
1587 * |
1588 * 32 KB data block
1589 *
1590 * As a result, an individual I/O may not span all child vdevs; moreover, a
1591 * small I/O may only operate on a single child vdev.
1592 *
1593 * Note that since there are no parity bits calculated or written, this format
1594 * remains the same no matter how many parity bits are used in a normal RAID-Z
1595 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
1596 * would look like:
1597 *
1598 * parity parity parity data data data data
1599 * | | | | | | XX | |
1600 * ^
1601 * |
1602 * 32 KB data block
1603 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? BIO_READ : BIO_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block.  Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	/*
	 * Note: the increment expression reads rc->rc_size, which is safe
	 * because rc is assigned at the top of the body before the
	 * increment executes.
	 */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)rc->rc_data) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
#endif	/* _KERNEL */

	return (err);
}
1688 #endif
1689
1690 static uint64_t
vdev_raidz_asize(vdev_t * vd,uint64_t psize)1691 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1692 {
1693 uint64_t asize;
1694 uint64_t ashift = vd->vdev_top->vdev_ashift;
1695 uint64_t cols = vd->vdev_children;
1696 uint64_t nparity = vd->vdev_nparity;
1697
1698 asize = ((psize - 1) >> ashift) + 1;
1699 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1700 asize = roundup(asize, nparity + 1) << ashift;
1701
1702 return (asize);
1703 }
1704
1705 static void
vdev_raidz_child_done(zio_t * zio)1706 vdev_raidz_child_done(zio_t *zio)
1707 {
1708 raidz_col_t *rc = zio->io_private;
1709
1710 rc->rc_error = zio->io_error;
1711 rc->rc_tried = 1;
1712 rc->rc_skipped = 0;
1713 }
1714
1715 /*
1716 * Start an IO operation on a RAIDZ VDev
1717 *
1718 * Outline:
1719 * - For write operations:
1720 * 1. Generate the parity data
1721 * 2. Create child zio write operations to each column's vdev, for both
1722 * data and parity.
1723 * 3. If the column skips any sectors for padding, create optional dummy
1724 * write zio children for those areas to improve aggregation continuity.
1725 * - For read operations:
1726 * 1. Create child zio read operations to each data column's vdev to read
1727 * the range of data required for zio.
1728 * 2. If this is a scrub or resilver operation, or if any of the data
1729 * vdevs have had errors, then create zio read operations to the parity
1730 * columns' VDevs as well.
1731 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	/* Map the logical I/O onto per-child columns. */
	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
	    zio->io_type == ZIO_TYPE_FREE,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_FREE) {
		/* Frees are fanned out to every column; no parity needed. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		zio_execute(zio);
		return;
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		/* Write both data and parity columns. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Read parity columns only when data is already known to be
		 * missing or when scrubbing/resilvering.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}
1838
1839
1840 /*
1841 * Report a checksum error for a child of a RAID-Z device.
1842 */
1843 static void
raidz_checksum_error(zio_t * zio,raidz_col_t * rc,void * bad_data)1844 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1845 {
1846 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1847
1848 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1849 zio_bad_cksum_t zbc;
1850 raidz_map_t *rm = zio->io_vsd;
1851
1852 mutex_enter(&vd->vdev_stat_lock);
1853 vd->vdev_stat.vs_checksum_errors++;
1854 mutex_exit(&vd->vdev_stat_lock);
1855
1856 zbc.zbc_has_cksum = 0;
1857 zbc.zbc_injected = rm->rm_ecksuminjected;
1858
1859 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1860 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1861 &zbc);
1862 }
1863 }
1864
1865 /*
1866 * We keep track of whether or not there were any injected errors, so that
1867 * any ereports we generate can note it.
1868 */
1869 static int
raidz_checksum_verify(zio_t * zio)1870 raidz_checksum_verify(zio_t *zio)
1871 {
1872 zio_bad_cksum_t zbc;
1873 raidz_map_t *rm = zio->io_vsd;
1874
1875 int ret = zio_checksum_error(zio, &zbc);
1876 if (ret != 0 && zbc.zbc_injected != 0)
1877 rm->rm_ecksuminjected = 1;
1878
1879 return (ret);
1880 }
1881
1882 /*
1883 * Generate the parity from the data columns. If we tried and were able to
1884 * read the parity without error, verify that the generated parity matches the
1885 * data we read. If it doesn't, we fire off a checksum error. Return the
1886 * number such failures.
1887 */
1888 static int
raidz_parity_verify(zio_t * zio,raidz_map_t * rm)1889 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1890 {
1891 void *orig[VDEV_RAIDZ_MAXPARITY];
1892 int c, ret = 0;
1893 raidz_col_t *rc;
1894
1895 blkptr_t *bp = zio->io_bp;
1896 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1897 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1898
1899 if (checksum == ZIO_CHECKSUM_NOPARITY)
1900 return (ret);
1901
1902 for (c = 0; c < rm->rm_firstdatacol; c++) {
1903 rc = &rm->rm_col[c];
1904 if (!rc->rc_tried || rc->rc_error != 0)
1905 continue;
1906 orig[c] = zio_buf_alloc(rc->rc_size);
1907 bcopy(rc->rc_data, orig[c], rc->rc_size);
1908 }
1909
1910 vdev_raidz_generate_parity(rm);
1911
1912 for (c = 0; c < rm->rm_firstdatacol; c++) {
1913 rc = &rm->rm_col[c];
1914 if (!rc->rc_tried || rc->rc_error != 0)
1915 continue;
1916 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1917 raidz_checksum_error(zio, rc, orig[c]);
1918 rc->rc_error = SET_ERROR(ECKSUM);
1919 ret++;
1920 }
1921 zio_buf_free(orig[c], rc->rc_size);
1922 }
1923
1924 return (ret);
1925 }
1926
/*
 * Keep statistics on all the ways that we used parity to correct data:
 * raidz_corrected[code] counts successful reconstructions that used the
 * parity-column bitmask 'code' (as returned by vdev_raidz_reconstruct()).
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1931
1932 static int
vdev_raidz_worst_error(raidz_map_t * rm)1933 vdev_raidz_worst_error(raidz_map_t *rm)
1934 {
1935 int error = 0;
1936
1937 for (int c = 0; c < rm->rm_cols; c++)
1938 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1939
1940 return (error);
1941 }
1942
1943 /*
1944 * Iterate over all combinations of bad data and attempt a reconstruction.
1945 * Note that the algorithm below is non-optimal because it doesn't take into
1946 * account how reconstruction is actually performed. For example, with
1947 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1948 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1949 * cases we'd only use parity information in column 0.
1950 */
1951 static int
vdev_raidz_combrec(zio_t * zio,int total_errors,int data_errors)1952 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1953 {
1954 raidz_map_t *rm = zio->io_vsd;
1955 raidz_col_t *rc;
1956 void *orig[VDEV_RAIDZ_MAXPARITY];
1957 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1958 int *tgts = &tstore[1];
1959 int current, next, i, c, n;
1960 int code, ret = 0;
1961
1962 ASSERT(total_errors < rm->rm_firstdatacol);
1963
1964 /*
1965 * This simplifies one edge condition.
1966 */
1967 tgts[-1] = -1;
1968
1969 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1970 /*
1971 * Initialize the targets array by finding the first n columns
1972 * that contain no error.
1973 *
1974 * If there were no data errors, we need to ensure that we're
1975 * always explicitly attempting to reconstruct at least one
1976 * data column. To do this, we simply push the highest target
1977 * up into the data columns.
1978 */
1979 for (c = 0, i = 0; i < n; i++) {
1980 if (i == n - 1 && data_errors == 0 &&
1981 c < rm->rm_firstdatacol) {
1982 c = rm->rm_firstdatacol;
1983 }
1984
1985 while (rm->rm_col[c].rc_error != 0) {
1986 c++;
1987 ASSERT3S(c, <, rm->rm_cols);
1988 }
1989
1990 tgts[i] = c++;
1991 }
1992
1993 /*
1994 * Setting tgts[n] simplifies the other edge condition.
1995 */
1996 tgts[n] = rm->rm_cols;
1997
1998 /*
1999 * These buffers were allocated in previous iterations.
2000 */
2001 for (i = 0; i < n - 1; i++) {
2002 ASSERT(orig[i] != NULL);
2003 }
2004
2005 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2006
2007 current = 0;
2008 next = tgts[current];
2009
2010 while (current != n) {
2011 tgts[current] = next;
2012 current = 0;
2013
2014 /*
2015 * Save off the original data that we're going to
2016 * attempt to reconstruct.
2017 */
2018 for (i = 0; i < n; i++) {
2019 ASSERT(orig[i] != NULL);
2020 c = tgts[i];
2021 ASSERT3S(c, >=, 0);
2022 ASSERT3S(c, <, rm->rm_cols);
2023 rc = &rm->rm_col[c];
2024 bcopy(rc->rc_data, orig[i], rc->rc_size);
2025 }
2026
2027 /*
2028 * Attempt a reconstruction and exit the outer loop on
2029 * success.
2030 */
2031 code = vdev_raidz_reconstruct(rm, tgts, n);
2032 if (raidz_checksum_verify(zio) == 0) {
2033 atomic_inc_64(&raidz_corrected[code]);
2034
2035 for (i = 0; i < n; i++) {
2036 c = tgts[i];
2037 rc = &rm->rm_col[c];
2038 ASSERT(rc->rc_error == 0);
2039 if (rc->rc_tried)
2040 raidz_checksum_error(zio, rc,
2041 orig[i]);
2042 rc->rc_error = SET_ERROR(ECKSUM);
2043 }
2044
2045 ret = code;
2046 goto done;
2047 }
2048
2049 /*
2050 * Restore the original data.
2051 */
2052 for (i = 0; i < n; i++) {
2053 c = tgts[i];
2054 rc = &rm->rm_col[c];
2055 bcopy(orig[i], rc->rc_data, rc->rc_size);
2056 }
2057
2058 do {
2059 /*
2060 * Find the next valid column after the current
2061 * position..
2062 */
2063 for (next = tgts[current] + 1;
2064 next < rm->rm_cols &&
2065 rm->rm_col[next].rc_error != 0; next++)
2066 continue;
2067
2068 ASSERT(next <= tgts[current + 1]);
2069
2070 /*
2071 * If that spot is available, we're done here.
2072 */
2073 if (next != tgts[current + 1])
2074 break;
2075
2076 /*
2077 * Otherwise, find the next valid column after
2078 * the previous position.
2079 */
2080 for (c = tgts[current - 1] + 1;
2081 rm->rm_col[c].rc_error != 0; c++)
2082 continue;
2083
2084 tgts[current] = c;
2085 current++;
2086
2087 } while (current != n);
2088 }
2089 }
2090 n--;
2091 done:
2092 for (i = 0; i < n; i++) {
2093 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2094 }
2095
2096 return (ret);
2097 }
2098
2099 /*
2100 * Complete an IO operation on a RAIDZ VDev
2101 *
2102 * Outline:
2103 * - For write operations:
2104 * 1. Check for errors on the child IOs.
2105 * 2. Return, setting an error code if too few child VDevs were written
2106 * to reconstruct the data later. Note that partial writes are
2107 * considered successful if they can be reconstructed at all.
2108 * - For read operations:
2109 * 1. Check for errors on the child IOs.
2110 * 2. If data errors occurred:
2111 * a. Try to reassemble the data from the parity available.
2112 * b. If we haven't yet read the parity drives, read them now.
2113 * c. If all parity drives have been read but the data still doesn't
2114 * reassemble with a correct checksum, then try combinatorial
2115 * reconstruction.
2116 * d. If that doesn't work, return an error.
2117 * 3. If there were unexpected errors or this is a resilver operation,
2118 * rewrite the vdevs that had errors.
2119 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;	/* failures on columns we didn't skip */
	int parity_errors = 0;		/* failed parity-column children */
	int parity_untried = 0;		/* parity columns never read */
	int data_errors = 0;		/* failed data-column children */
	int total_errors = 0;		/* parity_errors + data_errors */
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];	/* data columns to reconstruct */
	int code;			/* parity combination actually used */

	ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Tally child I/O results.  Columns [0, rm_firstdatacol) hold
	 * parity; the rest hold data.  Only failures on columns that were
	 * not deliberately skipped count as "unexpected" (and thus trigger
	 * self-healing writes below).
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	} else if (zio->io_type == ZIO_TYPE_FREE) {
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			/* All data columns arrived intact; just verify. */
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	/*
	 * Phase 2: reissue reads for every column not yet tried, then
	 * return.  The zio will re-enter this function once those child
	 * I/Os complete; at that point every column is marked rc_tried,
	 * so the loop below finds nothing and we fall through to phase 3.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	/*
	 * Self-heal: overwrite any column that had an error with the good
	 * data we now hold, but only if the pool is writeable and the
	 * failure was unexpected (or we're resilvering anyway).
	 */
	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
2385
2386 static void
vdev_raidz_state_change(vdev_t * vd,int faulted,int degraded)2387 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2388 {
2389 if (faulted > vd->vdev_nparity)
2390 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2391 VDEV_AUX_NO_REPLICAS);
2392 else if (degraded + faulted != 0)
2393 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2394 else
2395 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2396 }
2397
/*
 * Operation vector for RAID-Z vdevs, consumed by the generic vdev layer.
 * The two NULL slots are optional callbacks this vdev type does not
 * implement (presumably hold/release -- confirm against the vdev_ops_t
 * definition in vdev_impl.h).
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,
	NULL,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
2410