1789Sahrens /*
2789Sahrens * CDDL HEADER START
3789Sahrens *
4789Sahrens * The contents of this file are subject to the terms of the
51544Seschrock * Common Development and Distribution License (the "License").
61544Seschrock * You may not use this file except in compliance with the License.
7789Sahrens *
8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9789Sahrens * or http://www.opensolaris.org/os/licensing.
10789Sahrens * See the License for the specific language governing permissions
11789Sahrens * and limitations under the License.
12789Sahrens *
13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each
14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15789Sahrens * If applicable, add the following below this CDDL HEADER, with the
16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying
17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18789Sahrens *
19789Sahrens * CDDL HEADER END
20789Sahrens */
212082Seschrock
22789Sahrens /*
23*12296SLin.Ling@Sun.COM * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24789Sahrens */
25789Sahrens
26789Sahrens #include <sys/zfs_context.h>
27789Sahrens #include <sys/spa.h>
28789Sahrens #include <sys/vdev_impl.h>
29789Sahrens #include <sys/zio.h>
30789Sahrens #include <sys/zio_checksum.h>
31789Sahrens #include <sys/fs/zfs.h>
321544Seschrock #include <sys/fm/fs/zfs.h>
33789Sahrens
34789Sahrens /*
35789Sahrens * Virtual device vector for RAID-Z.
362082Seschrock *
3710105Sadam.leventhal@sun.com * This vdev supports single, double, and triple parity. For single parity,
3810105Sadam.leventhal@sun.com * we use a simple XOR of all the data columns. For double or triple parity,
3910105Sadam.leventhal@sun.com * we use a special case of Reed-Solomon coding. This extends the
4010105Sadam.leventhal@sun.com * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
4110105Sadam.leventhal@sun.com * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
4210105Sadam.leventhal@sun.com * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
4310105Sadam.leventhal@sun.com * former is also based. The latter is designed to provide higher performance
4410105Sadam.leventhal@sun.com * for writes.
4510105Sadam.leventhal@sun.com *
4610105Sadam.leventhal@sun.com * Note that the Plank paper claimed to support arbitrary N+M, but was then
4710105Sadam.leventhal@sun.com * amended six years later identifying a critical flaw that invalidates its
4810105Sadam.leventhal@sun.com * claims. Nevertheless, the technique can be adapted to work for up to
4910105Sadam.leventhal@sun.com * triple parity. For additional parity, the amendment "Note: Correction to
5010105Sadam.leventhal@sun.com * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
5110105Sadam.leventhal@sun.com * is viable, but the additional complexity means that write performance will
5210105Sadam.leventhal@sun.com * suffer.
5310105Sadam.leventhal@sun.com *
 * All of the methods above operate on a Galois field with 2^N elements. In
 * our case we choose N=8, giving GF(2^8), so that all elements can be
 * expressed with a single byte. Briefly, the operations on the field are
 * defined as follows:
582082Seschrock *
592082Seschrock * o addition (+) is represented by a bitwise XOR
602082Seschrock * o subtraction (-) is therefore identical to addition: A + B = A - B
612082Seschrock * o multiplication of A by 2 is defined by the following bitwise expression:
622082Seschrock * (A * 2)_7 = A_6
632082Seschrock * (A * 2)_6 = A_5
642082Seschrock * (A * 2)_5 = A_4
652082Seschrock * (A * 2)_4 = A_3 + A_7
662082Seschrock * (A * 2)_3 = A_2 + A_7
672082Seschrock * (A * 2)_2 = A_1 + A_7
682082Seschrock * (A * 2)_1 = A_0
692082Seschrock * (A * 2)_0 = A_7
702082Seschrock *
712082Seschrock * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
7210105Sadam.leventhal@sun.com * As an aside, this multiplication is derived from the error correcting
7310105Sadam.leventhal@sun.com * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
742082Seschrock *
752082Seschrock * Observe that any number in the field (except for 0) can be expressed as a
762082Seschrock * power of 2 -- a generator for the field. We store a table of the powers of
772082Seschrock * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
782082Seschrock * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
7910105Sadam.leventhal@sun.com * than field addition). The inverse of a field element A (A^-1) is therefore
8010105Sadam.leventhal@sun.com * A ^ (255 - 1) = A^254.
812082Seschrock *
8210105Sadam.leventhal@sun.com * The up-to-three parity columns, P, Q, R over several data columns,
8310105Sadam.leventhal@sun.com * D_0, ... D_n-1, can be expressed by field operations:
842082Seschrock *
852082Seschrock * P = D_0 + D_1 + ... + D_n-2 + D_n-1
862082Seschrock * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
872082Seschrock * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
8810105Sadam.leventhal@sun.com * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
8910105Sadam.leventhal@sun.com * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
902082Seschrock *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
9210105Sadam.leventhal@sun.com * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
9310105Sadam.leventhal@sun.com * independent coefficients. (There are no additional coefficients that have
9410105Sadam.leventhal@sun.com * this property which is why the uncorrected Plank method breaks down.)
9510105Sadam.leventhal@sun.com *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
98789Sahrens */
99789Sahrens
/*
 * State for one column of a RAID-Z I/O: the child device it maps to, where
 * on that child it lives, the buffers involved, and the outcome of the I/O.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* index of the child device used for I/O */
	uint64_t rc_offset;		/* offset on that device */
	uint64_t rc_size;		/* size of this column's I/O */
	void *rc_data;			/* buffer holding this column's I/O data */
	void *rc_gdata;			/* buffer holding the "good" version */
	int rc_error;			/* I/O error recorded for this device */
	uint8_t rc_tried;		/* nonzero if I/O was attempted on column */
	uint8_t rc_skipped;		/* nonzero if this column was skipped */
} raidz_col_t;
110789Sahrens
111789Sahrens typedef struct raidz_map {
11210105Sadam.leventhal@sun.com uint64_t rm_cols; /* Regular column count */
11310105Sadam.leventhal@sun.com uint64_t rm_scols; /* Count including skipped columns */
1142082Seschrock uint64_t rm_bigcols; /* Number of oversized columns */
1152082Seschrock uint64_t rm_asize; /* Actual total I/O size */
1162082Seschrock uint64_t rm_missingdata; /* Count of missing data devices */
1172082Seschrock uint64_t rm_missingparity; /* Count of missing parity devices */
1182082Seschrock uint64_t rm_firstdatacol; /* First data column/parity count */
11910450Sadam.leventhal@sun.com uint64_t rm_nskip; /* Skipped sectors for padding */
12010450Sadam.leventhal@sun.com uint64_t rm_skipstart; /* Column index of padding start */
12110614SJonathan.Adams@Sun.COM void *rm_datacopy; /* rm_asize-buffer of copied data */
12210614SJonathan.Adams@Sun.COM uintptr_t rm_reports; /* # of referencing checksum reports */
12310614SJonathan.Adams@Sun.COM uint8_t rm_freed; /* map no longer has referencing ZIO */
12410614SJonathan.Adams@Sun.COM uint8_t rm_ecksuminjected; /* checksum error was injected */
1252082Seschrock raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
126789Sahrens } raidz_map_t;
127789Sahrens
/*
 * Indices of the parity columns within a raidz map's column array.
 */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Field multiplication of a single byte by 2 and by 4. The result is an
 * int whose low 8 bits hold the product; callers are expected to store it
 * into a byte-sized object (or otherwise discard the upper bits).
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * Both (x) and (mask) must be modifiable 64-bit lvalues; (x) is updated in
 * place and (mask) is used as scratch space. The arguments are evaluated
 * more than once, so they must not have side effects.
 *
 * The bodies are wrapped in do { } while (0) so that an invocation followed
 * by a semicolon forms exactly one statement and can therefore be used
 * safely as the body of an unbraced if/else.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	/* isolate the top bit of every byte ... */ \
	(mask) = (x) & 0x8080808080808080ULL; \
	/* ... and smear it into 0xff for each byte that had it set */ \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	/* shift each byte left by one, then reduce where it overflowed */ \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)
15410105Sadam.leventhal@sun.com
/*
 * Tunable: when set to a nonzero value, reconstruction is forced to use the
 * general purpose method. It has no explicit initializer, so it starts out
 * as zero.
 */
int vdev_raidz_default_to_general;
1592082Seschrock
/*
 * Lookup tables for the Galois field used by RAID-Z.
 *
 * vdev_raidz_pow2[i] is 2 raised to the power i in the field, obtained by
 * starting from 1 and repeatedly multiplying by 2. The sequence has period
 * 255, which is why the final entry (index 255) is 0x01 again.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};

/*
 * vdev_raidz_log2[a] is the exponent i such that vdev_raidz_pow2[i] == a,
 * for every nonzero a. Zero has no logarithm, so the entry at index 0 is
 * only a placeholder and callers must special-case a == 0.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
2322082Seschrock
23310614SJonathan.Adams@Sun.COM static void vdev_raidz_generate_parity(raidz_map_t *rm);
23410614SJonathan.Adams@Sun.COM
2352082Seschrock /*
2362082Seschrock * Multiply a given number by 2 raised to the given power.
2372082Seschrock */
2382082Seschrock static uint8_t
vdev_raidz_exp2(uint_t a,int exp)2392082Seschrock vdev_raidz_exp2(uint_t a, int exp)
2402082Seschrock {
2412082Seschrock if (a == 0)
2422082Seschrock return (0);
2432082Seschrock
2442082Seschrock ASSERT(exp >= 0);
2452082Seschrock ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
2462082Seschrock
2472082Seschrock exp += vdev_raidz_log2[a];
2482082Seschrock if (exp > 255)
2492082Seschrock exp -= 255;
2502082Seschrock
2512082Seschrock return (vdev_raidz_pow2[exp]);
2522082Seschrock }
2532082Seschrock
2547754SJeff.Bonwick@Sun.COM static void
vdev_raidz_map_free(raidz_map_t * rm)25510614SJonathan.Adams@Sun.COM vdev_raidz_map_free(raidz_map_t *rm)
2567754SJeff.Bonwick@Sun.COM {
2577754SJeff.Bonwick@Sun.COM int c;
25810653SJonathan.Adams@Sun.COM size_t size;
2597754SJeff.Bonwick@Sun.COM
26010614SJonathan.Adams@Sun.COM for (c = 0; c < rm->rm_firstdatacol; c++) {
2617754SJeff.Bonwick@Sun.COM zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
2627754SJeff.Bonwick@Sun.COM
26310614SJonathan.Adams@Sun.COM if (rm->rm_col[c].rc_gdata != NULL)
26410614SJonathan.Adams@Sun.COM zio_buf_free(rm->rm_col[c].rc_gdata,
26510614SJonathan.Adams@Sun.COM rm->rm_col[c].rc_size);
26610614SJonathan.Adams@Sun.COM }
26710614SJonathan.Adams@Sun.COM
26810653SJonathan.Adams@Sun.COM size = 0;
26910653SJonathan.Adams@Sun.COM for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
27010653SJonathan.Adams@Sun.COM size += rm->rm_col[c].rc_size;
27110653SJonathan.Adams@Sun.COM
27210614SJonathan.Adams@Sun.COM if (rm->rm_datacopy != NULL)
27310614SJonathan.Adams@Sun.COM zio_buf_free(rm->rm_datacopy, size);
27410614SJonathan.Adams@Sun.COM
27510105Sadam.leventhal@sun.com kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
2767754SJeff.Bonwick@Sun.COM }
2777754SJeff.Bonwick@Sun.COM
27810614SJonathan.Adams@Sun.COM static void
vdev_raidz_map_free_vsd(zio_t * zio)27910614SJonathan.Adams@Sun.COM vdev_raidz_map_free_vsd(zio_t *zio)
28010614SJonathan.Adams@Sun.COM {
28110614SJonathan.Adams@Sun.COM raidz_map_t *rm = zio->io_vsd;
28210614SJonathan.Adams@Sun.COM
28310614SJonathan.Adams@Sun.COM ASSERT3U(rm->rm_freed, ==, 0);
28410614SJonathan.Adams@Sun.COM rm->rm_freed = 1;
28510614SJonathan.Adams@Sun.COM
28610614SJonathan.Adams@Sun.COM if (rm->rm_reports == 0)
28710614SJonathan.Adams@Sun.COM vdev_raidz_map_free(rm);
28810614SJonathan.Adams@Sun.COM }
28910614SJonathan.Adams@Sun.COM
29010614SJonathan.Adams@Sun.COM /*ARGSUSED*/
29110614SJonathan.Adams@Sun.COM static void
vdev_raidz_cksum_free(void * arg,size_t ignored)29210614SJonathan.Adams@Sun.COM vdev_raidz_cksum_free(void *arg, size_t ignored)
29310614SJonathan.Adams@Sun.COM {
29410614SJonathan.Adams@Sun.COM raidz_map_t *rm = arg;
29510614SJonathan.Adams@Sun.COM
29610614SJonathan.Adams@Sun.COM ASSERT3U(rm->rm_reports, >, 0);
29710614SJonathan.Adams@Sun.COM
29810653SJonathan.Adams@Sun.COM if (--rm->rm_reports == 0 && rm->rm_freed != 0)
29910614SJonathan.Adams@Sun.COM vdev_raidz_map_free(rm);
30010614SJonathan.Adams@Sun.COM }
30110614SJonathan.Adams@Sun.COM
30210614SJonathan.Adams@Sun.COM static void
vdev_raidz_cksum_finish(zio_cksum_report_t * zcr,const void * good_data)30310614SJonathan.Adams@Sun.COM vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
30410614SJonathan.Adams@Sun.COM {
30510614SJonathan.Adams@Sun.COM raidz_map_t *rm = zcr->zcr_cbdata;
30610614SJonathan.Adams@Sun.COM size_t c = zcr->zcr_cbinfo;
30710614SJonathan.Adams@Sun.COM size_t x;
30810614SJonathan.Adams@Sun.COM
30910614SJonathan.Adams@Sun.COM const char *good = NULL;
31010614SJonathan.Adams@Sun.COM const char *bad = rm->rm_col[c].rc_data;
31110614SJonathan.Adams@Sun.COM
31210614SJonathan.Adams@Sun.COM if (good_data == NULL) {
31310614SJonathan.Adams@Sun.COM zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
31410614SJonathan.Adams@Sun.COM return;
31510614SJonathan.Adams@Sun.COM }
31610614SJonathan.Adams@Sun.COM
31710614SJonathan.Adams@Sun.COM if (c < rm->rm_firstdatacol) {
31810614SJonathan.Adams@Sun.COM /*
31910614SJonathan.Adams@Sun.COM * The first time through, calculate the parity blocks for
32010614SJonathan.Adams@Sun.COM * the good data (this relies on the fact that the good
32110614SJonathan.Adams@Sun.COM * data never changes for a given logical ZIO)
32210614SJonathan.Adams@Sun.COM */
32310614SJonathan.Adams@Sun.COM if (rm->rm_col[0].rc_gdata == NULL) {
32410614SJonathan.Adams@Sun.COM char *bad_parity[VDEV_RAIDZ_MAXPARITY];
32510614SJonathan.Adams@Sun.COM char *buf;
32610614SJonathan.Adams@Sun.COM
32710614SJonathan.Adams@Sun.COM /*
32810614SJonathan.Adams@Sun.COM * Set up the rm_col[]s to generate the parity for
32910614SJonathan.Adams@Sun.COM * good_data, first saving the parity bufs and
33010614SJonathan.Adams@Sun.COM * replacing them with buffers to hold the result.
33110614SJonathan.Adams@Sun.COM */
33210614SJonathan.Adams@Sun.COM for (x = 0; x < rm->rm_firstdatacol; x++) {
33310614SJonathan.Adams@Sun.COM bad_parity[x] = rm->rm_col[x].rc_data;
33410614SJonathan.Adams@Sun.COM rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
33510614SJonathan.Adams@Sun.COM zio_buf_alloc(rm->rm_col[x].rc_size);
33610614SJonathan.Adams@Sun.COM }
33710614SJonathan.Adams@Sun.COM
33810614SJonathan.Adams@Sun.COM /* fill in the data columns from good_data */
33910614SJonathan.Adams@Sun.COM buf = (char *)good_data;
34010614SJonathan.Adams@Sun.COM for (; x < rm->rm_cols; x++) {
34110614SJonathan.Adams@Sun.COM rm->rm_col[x].rc_data = buf;
34210614SJonathan.Adams@Sun.COM buf += rm->rm_col[x].rc_size;
34310614SJonathan.Adams@Sun.COM }
34410614SJonathan.Adams@Sun.COM
34510614SJonathan.Adams@Sun.COM /*
34610614SJonathan.Adams@Sun.COM * Construct the parity from the good data.
34710614SJonathan.Adams@Sun.COM */
34810614SJonathan.Adams@Sun.COM vdev_raidz_generate_parity(rm);
34910614SJonathan.Adams@Sun.COM
35010614SJonathan.Adams@Sun.COM /* restore everything back to its original state */
35110614SJonathan.Adams@Sun.COM for (x = 0; x < rm->rm_firstdatacol; x++)
35210614SJonathan.Adams@Sun.COM rm->rm_col[x].rc_data = bad_parity[x];
35310614SJonathan.Adams@Sun.COM
35410614SJonathan.Adams@Sun.COM buf = rm->rm_datacopy;
35510614SJonathan.Adams@Sun.COM for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
35610614SJonathan.Adams@Sun.COM rm->rm_col[x].rc_data = buf;
35710614SJonathan.Adams@Sun.COM buf += rm->rm_col[x].rc_size;
35810614SJonathan.Adams@Sun.COM }
35910614SJonathan.Adams@Sun.COM }
36010614SJonathan.Adams@Sun.COM
36110614SJonathan.Adams@Sun.COM ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
36210614SJonathan.Adams@Sun.COM good = rm->rm_col[c].rc_gdata;
36310614SJonathan.Adams@Sun.COM } else {
36410614SJonathan.Adams@Sun.COM /* adjust good_data to point at the start of our column */
36510614SJonathan.Adams@Sun.COM good = good_data;
36610614SJonathan.Adams@Sun.COM
36710614SJonathan.Adams@Sun.COM for (x = rm->rm_firstdatacol; x < c; x++)
36810614SJonathan.Adams@Sun.COM good += rm->rm_col[x].rc_size;
36910614SJonathan.Adams@Sun.COM }
37010614SJonathan.Adams@Sun.COM
37110614SJonathan.Adams@Sun.COM /* we drop the ereport if it ends up that the data was good */
37210614SJonathan.Adams@Sun.COM zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
37310614SJonathan.Adams@Sun.COM }
37410614SJonathan.Adams@Sun.COM
37510614SJonathan.Adams@Sun.COM /*
37610614SJonathan.Adams@Sun.COM * Invoked indirectly by zfs_ereport_start_checksum(), called
37710614SJonathan.Adams@Sun.COM * below when our read operation fails completely. The main point
37810614SJonathan.Adams@Sun.COM * is to keep a copy of everything we read from disk, so that at
37910614SJonathan.Adams@Sun.COM * vdev_raidz_cksum_finish() time we can compare it with the good data.
38010614SJonathan.Adams@Sun.COM */
38110614SJonathan.Adams@Sun.COM static void
vdev_raidz_cksum_report(zio_t * zio,zio_cksum_report_t * zcr,void * arg)38210614SJonathan.Adams@Sun.COM vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
38310614SJonathan.Adams@Sun.COM {
38410614SJonathan.Adams@Sun.COM size_t c = (size_t)(uintptr_t)arg;
38510614SJonathan.Adams@Sun.COM caddr_t buf;
38610614SJonathan.Adams@Sun.COM
38710614SJonathan.Adams@Sun.COM raidz_map_t *rm = zio->io_vsd;
38810614SJonathan.Adams@Sun.COM size_t size;
38910614SJonathan.Adams@Sun.COM
39010614SJonathan.Adams@Sun.COM /* set up the report and bump the refcount */
39110614SJonathan.Adams@Sun.COM zcr->zcr_cbdata = rm;
39210614SJonathan.Adams@Sun.COM zcr->zcr_cbinfo = c;
39310614SJonathan.Adams@Sun.COM zcr->zcr_finish = vdev_raidz_cksum_finish;
39410614SJonathan.Adams@Sun.COM zcr->zcr_free = vdev_raidz_cksum_free;
39510614SJonathan.Adams@Sun.COM
39610614SJonathan.Adams@Sun.COM rm->rm_reports++;
39710614SJonathan.Adams@Sun.COM ASSERT3U(rm->rm_reports, >, 0);
39810614SJonathan.Adams@Sun.COM
39910653SJonathan.Adams@Sun.COM if (rm->rm_datacopy != NULL)
40010614SJonathan.Adams@Sun.COM return;
40110614SJonathan.Adams@Sun.COM
40210614SJonathan.Adams@Sun.COM /*
40310653SJonathan.Adams@Sun.COM * It's the first time we're called for this raidz_map_t, so we need
40410653SJonathan.Adams@Sun.COM * to copy the data aside; there's no guarantee that our zio's buffer
40510653SJonathan.Adams@Sun.COM * won't be re-used for something else.
40610614SJonathan.Adams@Sun.COM *
40710653SJonathan.Adams@Sun.COM * Our parity data is already in separate buffers, so there's no need
40810614SJonathan.Adams@Sun.COM * to copy them.
40910614SJonathan.Adams@Sun.COM */
41010614SJonathan.Adams@Sun.COM
41110653SJonathan.Adams@Sun.COM size = 0;
41210653SJonathan.Adams@Sun.COM for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
41310653SJonathan.Adams@Sun.COM size += rm->rm_col[c].rc_size;
41410614SJonathan.Adams@Sun.COM
41510614SJonathan.Adams@Sun.COM buf = rm->rm_datacopy = zio_buf_alloc(size);
41610653SJonathan.Adams@Sun.COM
41710653SJonathan.Adams@Sun.COM for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
41810614SJonathan.Adams@Sun.COM raidz_col_t *col = &rm->rm_col[c];
41910614SJonathan.Adams@Sun.COM
42010614SJonathan.Adams@Sun.COM bcopy(col->rc_data, buf, col->rc_size);
42110614SJonathan.Adams@Sun.COM col->rc_data = buf;
42210614SJonathan.Adams@Sun.COM
42310614SJonathan.Adams@Sun.COM buf += col->rc_size;
42410614SJonathan.Adams@Sun.COM }
42510614SJonathan.Adams@Sun.COM ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
42610614SJonathan.Adams@Sun.COM }
42710614SJonathan.Adams@Sun.COM
42810614SJonathan.Adams@Sun.COM static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
42910614SJonathan.Adams@Sun.COM vdev_raidz_map_free_vsd,
43010614SJonathan.Adams@Sun.COM vdev_raidz_cksum_report
43110614SJonathan.Adams@Sun.COM };
43210614SJonathan.Adams@Sun.COM
433789Sahrens static raidz_map_t *
vdev_raidz_map_alloc(zio_t * zio,uint64_t unit_shift,uint64_t dcols,uint64_t nparity)4342082Seschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
4352082Seschrock uint64_t nparity)
436789Sahrens {
437789Sahrens raidz_map_t *rm;
438789Sahrens uint64_t b = zio->io_offset >> unit_shift;
439789Sahrens uint64_t s = zio->io_size >> unit_shift;
440789Sahrens uint64_t f = b % dcols;
441789Sahrens uint64_t o = (b / dcols) << unit_shift;
44210105Sadam.leventhal@sun.com uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
443789Sahrens
4442082Seschrock q = s / (dcols - nparity);
4452082Seschrock r = s - q * (dcols - nparity);
4462082Seschrock bc = (r == 0 ? 0 : r + nparity);
44710105Sadam.leventhal@sun.com tot = s + nparity * (q + (r == 0 ? 0 : 1));
448789Sahrens
44910105Sadam.leventhal@sun.com if (q == 0) {
45010105Sadam.leventhal@sun.com acols = bc;
45110105Sadam.leventhal@sun.com scols = MIN(dcols, roundup(bc, nparity + 1));
45210105Sadam.leventhal@sun.com } else {
45310105Sadam.leventhal@sun.com acols = dcols;
45410105Sadam.leventhal@sun.com scols = dcols;
45510105Sadam.leventhal@sun.com }
456789Sahrens
45710105Sadam.leventhal@sun.com ASSERT3U(acols, <=, scols);
45810105Sadam.leventhal@sun.com
45910105Sadam.leventhal@sun.com rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
460789Sahrens
461789Sahrens rm->rm_cols = acols;
46210105Sadam.leventhal@sun.com rm->rm_scols = scols;
463789Sahrens rm->rm_bigcols = bc;
46410450Sadam.leventhal@sun.com rm->rm_skipstart = bc;
4652082Seschrock rm->rm_missingdata = 0;
4662082Seschrock rm->rm_missingparity = 0;
4672082Seschrock rm->rm_firstdatacol = nparity;
46810614SJonathan.Adams@Sun.COM rm->rm_datacopy = NULL;
46910614SJonathan.Adams@Sun.COM rm->rm_reports = 0;
47010614SJonathan.Adams@Sun.COM rm->rm_freed = 0;
47110614SJonathan.Adams@Sun.COM rm->rm_ecksuminjected = 0;
472789Sahrens
47310105Sadam.leventhal@sun.com asize = 0;
47410105Sadam.leventhal@sun.com
47510105Sadam.leventhal@sun.com for (c = 0; c < scols; c++) {
476789Sahrens col = f + c;
477789Sahrens coff = o;
478789Sahrens if (col >= dcols) {
479789Sahrens col -= dcols;
480789Sahrens coff += 1ULL << unit_shift;
481789Sahrens }
4822082Seschrock rm->rm_col[c].rc_devidx = col;
483789Sahrens rm->rm_col[c].rc_offset = coff;
484789Sahrens rm->rm_col[c].rc_data = NULL;
48510614SJonathan.Adams@Sun.COM rm->rm_col[c].rc_gdata = NULL;
486789Sahrens rm->rm_col[c].rc_error = 0;
487789Sahrens rm->rm_col[c].rc_tried = 0;
488789Sahrens rm->rm_col[c].rc_skipped = 0;
48910105Sadam.leventhal@sun.com
49010105Sadam.leventhal@sun.com if (c >= acols)
49110105Sadam.leventhal@sun.com rm->rm_col[c].rc_size = 0;
49210105Sadam.leventhal@sun.com else if (c < bc)
49310105Sadam.leventhal@sun.com rm->rm_col[c].rc_size = (q + 1) << unit_shift;
49410105Sadam.leventhal@sun.com else
49510105Sadam.leventhal@sun.com rm->rm_col[c].rc_size = q << unit_shift;
49610105Sadam.leventhal@sun.com
49710105Sadam.leventhal@sun.com asize += rm->rm_col[c].rc_size;
498789Sahrens }
499789Sahrens
50010105Sadam.leventhal@sun.com ASSERT3U(asize, ==, tot << unit_shift);
50110105Sadam.leventhal@sun.com rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
50210450Sadam.leventhal@sun.com rm->rm_nskip = roundup(tot, nparity + 1) - tot;
50310450Sadam.leventhal@sun.com ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
50410450Sadam.leventhal@sun.com ASSERT3U(rm->rm_nskip, <=, nparity);
505789Sahrens
506789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++)
507789Sahrens rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
508789Sahrens
509789Sahrens rm->rm_col[c].rc_data = zio->io_data;
510789Sahrens
511789Sahrens for (c = c + 1; c < acols; c++)
512789Sahrens rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
513789Sahrens rm->rm_col[c - 1].rc_size;
514789Sahrens
5151133Seschrock /*
5162082Seschrock * If all data stored spans all columns, there's a danger that parity
5172082Seschrock * will always be on the same device and, since parity isn't read
5182082Seschrock * during normal operation, that that device's I/O bandwidth won't be
5192082Seschrock * used effectively. We therefore switch the parity every 1MB.
5202082Seschrock *
5212082Seschrock * ... at least that was, ostensibly, the theory. As a practical
5222082Seschrock * matter unless we juggle the parity between all devices evenly, we
5232082Seschrock * won't see any benefit. Further, occasional writes that aren't a
5242082Seschrock * multiple of the LCM of the number of children and the minimum
5252082Seschrock * stripe width are sufficient to avoid pessimal behavior.
5262082Seschrock * Unfortunately, this decision created an implicit on-disk format
5273456Sahl * requirement that we need to support for all eternity, but only
5283456Sahl * for single-parity RAID-Z.
52910450Sadam.leventhal@sun.com *
53010450Sadam.leventhal@sun.com * If we intend to skip a sector in the zeroth column for padding
53110450Sadam.leventhal@sun.com * we must make sure to note this swap. We will never intend to
53210450Sadam.leventhal@sun.com * skip the first column since at least one data and one parity
53310450Sadam.leventhal@sun.com * column must appear in each row.
5341133Seschrock */
5351133Seschrock ASSERT(rm->rm_cols >= 2);
5361133Seschrock ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
537789Sahrens
5382082Seschrock if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
5392082Seschrock devidx = rm->rm_col[0].rc_devidx;
5401133Seschrock o = rm->rm_col[0].rc_offset;
5412082Seschrock rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
5421133Seschrock rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
5432082Seschrock rm->rm_col[1].rc_devidx = devidx;
5441133Seschrock rm->rm_col[1].rc_offset = o;
54510450Sadam.leventhal@sun.com
54610450Sadam.leventhal@sun.com if (rm->rm_skipstart == 0)
54710450Sadam.leventhal@sun.com rm->rm_skipstart = 1;
548789Sahrens }
549789Sahrens
550789Sahrens zio->io_vsd = rm;
55110614SJonathan.Adams@Sun.COM zio->io_vsd_ops = &vdev_raidz_vsd_ops;
552789Sahrens return (rm);
553789Sahrens }
554789Sahrens
555789Sahrens static void
vdev_raidz_generate_parity_p(raidz_map_t * rm)5562082Seschrock vdev_raidz_generate_parity_p(raidz_map_t *rm)
5572082Seschrock {
5582082Seschrock uint64_t *p, *src, pcount, ccount, i;
5592082Seschrock int c;
5602082Seschrock
5612082Seschrock pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
5622082Seschrock
5632082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
5642082Seschrock src = rm->rm_col[c].rc_data;
5652082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
5662082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
5672082Seschrock
5682082Seschrock if (c == rm->rm_firstdatacol) {
5692082Seschrock ASSERT(ccount == pcount);
57010105Sadam.leventhal@sun.com for (i = 0; i < ccount; i++, src++, p++) {
5712082Seschrock *p = *src;
5722082Seschrock }
5732082Seschrock } else {
5742082Seschrock ASSERT(ccount <= pcount);
57510105Sadam.leventhal@sun.com for (i = 0; i < ccount; i++, src++, p++) {
5762082Seschrock *p ^= *src;
5772082Seschrock }
5782082Seschrock }
5792082Seschrock }
5802082Seschrock }
5812082Seschrock
5822082Seschrock static void
vdev_raidz_generate_parity_pq(raidz_map_t * rm)5832082Seschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm)
584789Sahrens {
58510105Sadam.leventhal@sun.com uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
5862082Seschrock int c;
5872082Seschrock
58810105Sadam.leventhal@sun.com pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
5892082Seschrock ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
5902082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size);
5912082Seschrock
5922082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
5932082Seschrock src = rm->rm_col[c].rc_data;
5942082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
5952082Seschrock q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
59610105Sadam.leventhal@sun.com
59710105Sadam.leventhal@sun.com ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
5982082Seschrock
5992082Seschrock if (c == rm->rm_firstdatacol) {
60010105Sadam.leventhal@sun.com ASSERT(ccnt == pcnt || ccnt == 0);
60110105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++) {
6022082Seschrock *p = *src;
60310105Sadam.leventhal@sun.com *q = *src;
6042082Seschrock }
60510105Sadam.leventhal@sun.com for (; i < pcnt; i++, src++, p++, q++) {
60610105Sadam.leventhal@sun.com *p = 0;
6072082Seschrock *q = 0;
6082082Seschrock }
6092082Seschrock } else {
61010105Sadam.leventhal@sun.com ASSERT(ccnt <= pcnt);
611789Sahrens
6122082Seschrock /*
61310105Sadam.leventhal@sun.com * Apply the algorithm described above by multiplying
61410105Sadam.leventhal@sun.com * the previous result and adding in the new value.
6152082Seschrock */
61610105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++) {
61710105Sadam.leventhal@sun.com *p ^= *src;
61810105Sadam.leventhal@sun.com
61910105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask);
6202082Seschrock *q ^= *src;
6212082Seschrock }
6222082Seschrock
6232082Seschrock /*
6242082Seschrock * Treat short columns as though they are full of 0s.
62510105Sadam.leventhal@sun.com * Note that there's therefore nothing needed for P.
6262082Seschrock */
62710105Sadam.leventhal@sun.com for (; i < pcnt; i++, q++) {
62810105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask);
6292082Seschrock }
6302082Seschrock }
6312082Seschrock }
6322082Seschrock }
6332082Seschrock
6342082Seschrock static void
vdev_raidz_generate_parity_pqr(raidz_map_t * rm)63510105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
63610105Sadam.leventhal@sun.com {
63710105Sadam.leventhal@sun.com uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
63810105Sadam.leventhal@sun.com int c;
63910105Sadam.leventhal@sun.com
64010105Sadam.leventhal@sun.com pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
64110105Sadam.leventhal@sun.com ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
64210105Sadam.leventhal@sun.com rm->rm_col[VDEV_RAIDZ_Q].rc_size);
64310105Sadam.leventhal@sun.com ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
64410105Sadam.leventhal@sun.com rm->rm_col[VDEV_RAIDZ_R].rc_size);
64510105Sadam.leventhal@sun.com
64610105Sadam.leventhal@sun.com for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
64710105Sadam.leventhal@sun.com src = rm->rm_col[c].rc_data;
64810105Sadam.leventhal@sun.com p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
64910105Sadam.leventhal@sun.com q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
65010105Sadam.leventhal@sun.com r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
65110105Sadam.leventhal@sun.com
65210105Sadam.leventhal@sun.com ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
65310105Sadam.leventhal@sun.com
65410105Sadam.leventhal@sun.com if (c == rm->rm_firstdatacol) {
65510105Sadam.leventhal@sun.com ASSERT(ccnt == pcnt || ccnt == 0);
65610105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
65710105Sadam.leventhal@sun.com *p = *src;
65810105Sadam.leventhal@sun.com *q = *src;
65910105Sadam.leventhal@sun.com *r = *src;
66010105Sadam.leventhal@sun.com }
66110105Sadam.leventhal@sun.com for (; i < pcnt; i++, src++, p++, q++, r++) {
66210105Sadam.leventhal@sun.com *p = 0;
66310105Sadam.leventhal@sun.com *q = 0;
66410105Sadam.leventhal@sun.com *r = 0;
66510105Sadam.leventhal@sun.com }
66610105Sadam.leventhal@sun.com } else {
66710105Sadam.leventhal@sun.com ASSERT(ccnt <= pcnt);
66810105Sadam.leventhal@sun.com
66910105Sadam.leventhal@sun.com /*
67010105Sadam.leventhal@sun.com * Apply the algorithm described above by multiplying
67110105Sadam.leventhal@sun.com * the previous result and adding in the new value.
67210105Sadam.leventhal@sun.com */
67310105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
67410105Sadam.leventhal@sun.com *p ^= *src;
67510105Sadam.leventhal@sun.com
67610105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask);
67710105Sadam.leventhal@sun.com *q ^= *src;
67810105Sadam.leventhal@sun.com
67910105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_4(*r, mask);
68010105Sadam.leventhal@sun.com *r ^= *src;
68110105Sadam.leventhal@sun.com }
68210105Sadam.leventhal@sun.com
68310105Sadam.leventhal@sun.com /*
68410105Sadam.leventhal@sun.com * Treat short columns as though they are full of 0s.
68510105Sadam.leventhal@sun.com * Note that there's therefore nothing needed for P.
68610105Sadam.leventhal@sun.com */
68710105Sadam.leventhal@sun.com for (; i < pcnt; i++, q++, r++) {
68810105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask);
68910105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_4(*r, mask);
69010105Sadam.leventhal@sun.com }
69110105Sadam.leventhal@sun.com }
69210105Sadam.leventhal@sun.com }
69310105Sadam.leventhal@sun.com }
69410105Sadam.leventhal@sun.com
69510105Sadam.leventhal@sun.com /*
69610105Sadam.leventhal@sun.com * Generate RAID parity in the first virtual columns according to the number of
69710105Sadam.leventhal@sun.com * parity columns available.
69810105Sadam.leventhal@sun.com */
69910105Sadam.leventhal@sun.com static void
vdev_raidz_generate_parity(raidz_map_t * rm)70010105Sadam.leventhal@sun.com vdev_raidz_generate_parity(raidz_map_t *rm)
70110105Sadam.leventhal@sun.com {
70210105Sadam.leventhal@sun.com switch (rm->rm_firstdatacol) {
70310105Sadam.leventhal@sun.com case 1:
70410105Sadam.leventhal@sun.com vdev_raidz_generate_parity_p(rm);
70510105Sadam.leventhal@sun.com break;
70610105Sadam.leventhal@sun.com case 2:
70710105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pq(rm);
70810105Sadam.leventhal@sun.com break;
70910105Sadam.leventhal@sun.com case 3:
71010105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pqr(rm);
71110105Sadam.leventhal@sun.com break;
71210105Sadam.leventhal@sun.com default:
71310105Sadam.leventhal@sun.com cmn_err(CE_PANIC, "invalid RAID-Z configuration");
71410105Sadam.leventhal@sun.com }
71510105Sadam.leventhal@sun.com }
71610105Sadam.leventhal@sun.com
71710105Sadam.leventhal@sun.com static int
vdev_raidz_reconstruct_p(raidz_map_t * rm,int * tgts,int ntgts)71810105Sadam.leventhal@sun.com vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
7192082Seschrock {
7202082Seschrock uint64_t *dst, *src, xcount, ccount, count, i;
72110105Sadam.leventhal@sun.com int x = tgts[0];
7222082Seschrock int c;
7232082Seschrock
72410105Sadam.leventhal@sun.com ASSERT(ntgts == 1);
72510105Sadam.leventhal@sun.com ASSERT(x >= rm->rm_firstdatacol);
72610105Sadam.leventhal@sun.com ASSERT(x < rm->rm_cols);
72710105Sadam.leventhal@sun.com
7282082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
7292082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
7302082Seschrock ASSERT(xcount > 0);
7312082Seschrock
7322082Seschrock src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
7332082Seschrock dst = rm->rm_col[x].rc_data;
7342082Seschrock for (i = 0; i < xcount; i++, dst++, src++) {
7352082Seschrock *dst = *src;
7362082Seschrock }
7372082Seschrock
7382082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
739789Sahrens src = rm->rm_col[c].rc_data;
740789Sahrens dst = rm->rm_col[x].rc_data;
7412082Seschrock
7422082Seschrock if (c == x)
7432082Seschrock continue;
7442082Seschrock
7452082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
7462082Seschrock count = MIN(ccount, xcount);
7472082Seschrock
7482082Seschrock for (i = 0; i < count; i++, dst++, src++) {
7492082Seschrock *dst ^= *src;
750789Sahrens }
751789Sahrens }
75210105Sadam.leventhal@sun.com
75310105Sadam.leventhal@sun.com return (1 << VDEV_RAIDZ_P);
754789Sahrens }
755789Sahrens
75610105Sadam.leventhal@sun.com static int
vdev_raidz_reconstruct_q(raidz_map_t * rm,int * tgts,int ntgts)75710105Sadam.leventhal@sun.com vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
7582082Seschrock {
7592082Seschrock uint64_t *dst, *src, xcount, ccount, count, mask, i;
7602082Seschrock uint8_t *b;
76110105Sadam.leventhal@sun.com int x = tgts[0];
7622082Seschrock int c, j, exp;
7632082Seschrock
76410105Sadam.leventhal@sun.com ASSERT(ntgts == 1);
76510105Sadam.leventhal@sun.com
7662082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
7672082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
7682082Seschrock
7692082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
7702082Seschrock src = rm->rm_col[c].rc_data;
7712082Seschrock dst = rm->rm_col[x].rc_data;
7722082Seschrock
7732082Seschrock if (c == x)
7742082Seschrock ccount = 0;
7752082Seschrock else
7762082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
7772082Seschrock
7782082Seschrock count = MIN(ccount, xcount);
7792082Seschrock
7802082Seschrock if (c == rm->rm_firstdatacol) {
7812082Seschrock for (i = 0; i < count; i++, dst++, src++) {
7822082Seschrock *dst = *src;
7832082Seschrock }
7842082Seschrock for (; i < xcount; i++, dst++) {
7852082Seschrock *dst = 0;
7862082Seschrock }
7872082Seschrock
7882082Seschrock } else {
7892082Seschrock for (i = 0; i < count; i++, dst++, src++) {
79010105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*dst, mask);
7912082Seschrock *dst ^= *src;
7922082Seschrock }
7932082Seschrock
7942082Seschrock for (; i < xcount; i++, dst++) {
79510105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*dst, mask);
7962082Seschrock }
7972082Seschrock }
7982082Seschrock }
7992082Seschrock
8002082Seschrock src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
8012082Seschrock dst = rm->rm_col[x].rc_data;
8022082Seschrock exp = 255 - (rm->rm_cols - 1 - x);
8032082Seschrock
8042082Seschrock for (i = 0; i < xcount; i++, dst++, src++) {
8052082Seschrock *dst ^= *src;
8062082Seschrock for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
8072082Seschrock *b = vdev_raidz_exp2(*b, exp);
8082082Seschrock }
8092082Seschrock }
81010105Sadam.leventhal@sun.com
81110105Sadam.leventhal@sun.com return (1 << VDEV_RAIDZ_Q);
8122082Seschrock }
8132082Seschrock
81410105Sadam.leventhal@sun.com static int
vdev_raidz_reconstruct_pq(raidz_map_t * rm,int * tgts,int ntgts)81510105Sadam.leventhal@sun.com vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
8162082Seschrock {
8172082Seschrock uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
8182082Seschrock void *pdata, *qdata;
8192082Seschrock uint64_t xsize, ysize, i;
82010105Sadam.leventhal@sun.com int x = tgts[0];
82110105Sadam.leventhal@sun.com int y = tgts[1];
8222082Seschrock
82310105Sadam.leventhal@sun.com ASSERT(ntgts == 2);
8242082Seschrock ASSERT(x < y);
8252082Seschrock ASSERT(x >= rm->rm_firstdatacol);
8262082Seschrock ASSERT(y < rm->rm_cols);
8272082Seschrock
8282082Seschrock ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
8292082Seschrock
8302082Seschrock /*
8312082Seschrock * Move the parity data aside -- we're going to compute parity as
8322082Seschrock * though columns x and y were full of zeros -- Pxy and Qxy. We want to
8332082Seschrock * reuse the parity generation mechanism without trashing the actual
8342082Seschrock * parity so we make those columns appear to be full of zeros by
8352082Seschrock * setting their lengths to zero.
8362082Seschrock */
8372082Seschrock pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
8382082Seschrock qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
8392082Seschrock xsize = rm->rm_col[x].rc_size;
8402082Seschrock ysize = rm->rm_col[y].rc_size;
8412082Seschrock
8422082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data =
8432082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
8442082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data =
8452082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
8462082Seschrock rm->rm_col[x].rc_size = 0;
8472082Seschrock rm->rm_col[y].rc_size = 0;
8482082Seschrock
8492082Seschrock vdev_raidz_generate_parity_pq(rm);
8502082Seschrock
8512082Seschrock rm->rm_col[x].rc_size = xsize;
8522082Seschrock rm->rm_col[y].rc_size = ysize;
8532082Seschrock
8542082Seschrock p = pdata;
8552082Seschrock q = qdata;
8562082Seschrock pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
8572082Seschrock qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
8582082Seschrock xd = rm->rm_col[x].rc_data;
8592082Seschrock yd = rm->rm_col[y].rc_data;
8602082Seschrock
8612082Seschrock /*
8622082Seschrock * We now have:
8632082Seschrock * Pxy = P + D_x + D_y
8642082Seschrock * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
8652082Seschrock *
8662082Seschrock * We can then solve for D_x:
8672082Seschrock * D_x = A * (P + Pxy) + B * (Q + Qxy)
8682082Seschrock * where
8692082Seschrock * A = 2^(x - y) * (2^(x - y) + 1)^-1
8702082Seschrock * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
8712082Seschrock *
8722082Seschrock * With D_x in hand, we can easily solve for D_y:
8732082Seschrock * D_y = P + Pxy + D_x
8742082Seschrock */
8752082Seschrock
8762082Seschrock a = vdev_raidz_pow2[255 + x - y];
8772082Seschrock b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
8782082Seschrock tmp = 255 - vdev_raidz_log2[a ^ 1];
8792082Seschrock
8802082Seschrock aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
8812082Seschrock bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
8822082Seschrock
8832082Seschrock for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
8842082Seschrock *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
8852082Seschrock vdev_raidz_exp2(*q ^ *qxy, bexp);
8862082Seschrock
8872082Seschrock if (i < ysize)
8882082Seschrock *yd = *p ^ *pxy ^ *xd;
8892082Seschrock }
8902082Seschrock
8912082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
8922082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_size);
8932082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
8942082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size);
8952082Seschrock
8962082Seschrock /*
8972082Seschrock * Restore the saved parity data.
8982082Seschrock */
8992082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
9002082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
90110105Sadam.leventhal@sun.com
90210105Sadam.leventhal@sun.com return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
90310105Sadam.leventhal@sun.com }
90410105Sadam.leventhal@sun.com
90510105Sadam.leventhal@sun.com /* BEGIN CSTYLED */
90610105Sadam.leventhal@sun.com /*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
91210105Sadam.leventhal@sun.com *
91310105Sadam.leventhal@sun.com * __ __ __ __
91410105Sadam.leventhal@sun.com * | | __ __ | p_0 |
91510105Sadam.leventhal@sun.com * | V | | D_0 | | p_m-1 |
91610105Sadam.leventhal@sun.com * | | x | : | = | d_0 |
91710105Sadam.leventhal@sun.com * | I | | D_n-1 | | : |
91810105Sadam.leventhal@sun.com * | | ~~ ~~ | d_n-1 |
91910105Sadam.leventhal@sun.com * ~~ ~~ ~~ ~~
92010105Sadam.leventhal@sun.com *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
92510105Sadam.leventhal@sun.com *
92610105Sadam.leventhal@sun.com * __ __ __ __
92710105Sadam.leventhal@sun.com * | 1 .. 1 1 1 | | p_0 |
92810105Sadam.leventhal@sun.com * | 2^n-1 .. 4 2 1 | __ __ | : |
92910105Sadam.leventhal@sun.com * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
93010105Sadam.leventhal@sun.com * | 1 .. 0 0 0 | | D_1 | | d_0 |
93110105Sadam.leventhal@sun.com * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
93210105Sadam.leventhal@sun.com * | : : : : | | : | | d_2 |
93310105Sadam.leventhal@sun.com * | 0 .. 1 0 0 | | D_n-1 | | : |
93410105Sadam.leventhal@sun.com * | 0 .. 0 1 0 | ~~ ~~ | : |
93510105Sadam.leventhal@sun.com * | 0 .. 0 0 1 | | d_n-1 |
93610105Sadam.leventhal@sun.com * ~~ ~~ ~~ ~~
93710105Sadam.leventhal@sun.com *
93810105Sadam.leventhal@sun.com * Note that I, V, d, and p are known. To compute D, we must invert the
93910105Sadam.leventhal@sun.com * matrix and use the known data and parity values to reconstruct the unknown
94010105Sadam.leventhal@sun.com * data values. We begin by removing the rows in V|I and d|p that correspond
94110105Sadam.leventhal@sun.com * to failed or missing columns; we then make V|I square (n x n) and d|p
94210105Sadam.leventhal@sun.com * sized n by removing rows corresponding to unused parity from the bottom up
94310105Sadam.leventhal@sun.com * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
94410105Sadam.leventhal@sun.com * using Gauss-Jordan elimination. In the example below we use m=3 parity
94510105Sadam.leventhal@sun.com * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
94610105Sadam.leventhal@sun.com * __ __
94710105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 |
94810105Sadam.leventhal@sun.com * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
94910105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 | / /
95010105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 | / /
95110105Sadam.leventhal@sun.com * | 0 1 0 0 0 0 0 0 | <--' /
95210105Sadam.leventhal@sun.com * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
95310105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 |
95410105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 |
95510105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 |
95610105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 |
95710105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 |
95810105Sadam.leventhal@sun.com * ~~ ~~
95910105Sadam.leventhal@sun.com * __ __
96010105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 |
96110105Sadam.leventhal@sun.com * | 128 64 32 16 8 4 2 1 |
96210105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 |
96310105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 |
96410105Sadam.leventhal@sun.com * | 0 1 0 0 0 0 0 0 |
96510105Sadam.leventhal@sun.com * (V|I)' = | 0 0 1 0 0 0 0 0 |
96610105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 |
96710105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 |
96810105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 |
96910105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 |
97010105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 |
97110105Sadam.leventhal@sun.com * ~~ ~~
97210105Sadam.leventhal@sun.com *
97310105Sadam.leventhal@sun.com * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
97410105Sadam.leventhal@sun.com * have carefully chosen the seed values 1, 2, and 4 to ensure that this
97510105Sadam.leventhal@sun.com * matrix is not singular.
97610105Sadam.leventhal@sun.com * __ __
97710105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
97810105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
97910105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
98010105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
98110105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
98210105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
98310105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
98410105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
98510105Sadam.leventhal@sun.com * ~~ ~~
98610105Sadam.leventhal@sun.com * __ __
98710105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
98810105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
98910105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
99010105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
99110105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
99210105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
99310105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
99410105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
99510105Sadam.leventhal@sun.com * ~~ ~~
99610105Sadam.leventhal@sun.com * __ __
99710105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
99810105Sadam.leventhal@sun.com * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
99910105Sadam.leventhal@sun.com * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
100010105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
100110105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
100210105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
100310105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
100410105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
100510105Sadam.leventhal@sun.com * ~~ ~~
100610105Sadam.leventhal@sun.com * __ __
100710105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
100810105Sadam.leventhal@sun.com * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
100910105Sadam.leventhal@sun.com * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
101010105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
101110105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
101210105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
101310105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
101410105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
101510105Sadam.leventhal@sun.com * ~~ ~~
101610105Sadam.leventhal@sun.com * __ __
101710105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
101810105Sadam.leventhal@sun.com * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
101910105Sadam.leventhal@sun.com * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
102010105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
102110105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
102210105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
102310105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
102410105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
102510105Sadam.leventhal@sun.com * ~~ ~~
102610105Sadam.leventhal@sun.com * __ __
102710105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
102810105Sadam.leventhal@sun.com * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
102910105Sadam.leventhal@sun.com * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
103010105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
103110105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
103210105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
103310105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
103410105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
103510105Sadam.leventhal@sun.com * ~~ ~~
103610105Sadam.leventhal@sun.com * __ __
103710105Sadam.leventhal@sun.com * | 0 0 1 0 0 0 0 0 |
103810105Sadam.leventhal@sun.com * | 167 100 5 41 159 169 217 208 |
103910105Sadam.leventhal@sun.com * | 166 100 4 40 158 168 216 209 |
104010105Sadam.leventhal@sun.com * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
104110105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 |
104210105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 |
104310105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 |
104410105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 |
104510105Sadam.leventhal@sun.com * ~~ ~~
104610105Sadam.leventhal@sun.com *
104710105Sadam.leventhal@sun.com * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
104810105Sadam.leventhal@sun.com * of the missing data.
104910105Sadam.leventhal@sun.com *
105010105Sadam.leventhal@sun.com * As is apparent from the example above, the only non-trivial rows in the
105110105Sadam.leventhal@sun.com * inverse matrix correspond to the data disks that we're trying to
105210105Sadam.leventhal@sun.com * reconstruct. Indeed, those are the only rows we need as the others would
105310105Sadam.leventhal@sun.com * only be useful for reconstructing data known or assumed to be valid. For
105410105Sadam.leventhal@sun.com * that reason, we only build the coefficients in the rows that correspond to
105510105Sadam.leventhal@sun.com * targeted columns.
105610105Sadam.leventhal@sun.com */
105710105Sadam.leventhal@sun.com /* END CSTYLED */
105810105Sadam.leventhal@sun.com
105910105Sadam.leventhal@sun.com static void
vdev_raidz_matrix_init(raidz_map_t * rm,int n,int nmap,int * map,uint8_t ** rows)106010105Sadam.leventhal@sun.com vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
106110105Sadam.leventhal@sun.com uint8_t **rows)
106210105Sadam.leventhal@sun.com {
106310105Sadam.leventhal@sun.com int i, j;
106410105Sadam.leventhal@sun.com int pow;
106510105Sadam.leventhal@sun.com
106610105Sadam.leventhal@sun.com ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
106710105Sadam.leventhal@sun.com
106810105Sadam.leventhal@sun.com /*
106910105Sadam.leventhal@sun.com * Fill in the missing rows of interest.
107010105Sadam.leventhal@sun.com */
107110105Sadam.leventhal@sun.com for (i = 0; i < nmap; i++) {
107210105Sadam.leventhal@sun.com ASSERT3S(0, <=, map[i]);
107310105Sadam.leventhal@sun.com ASSERT3S(map[i], <=, 2);
107410105Sadam.leventhal@sun.com
107510105Sadam.leventhal@sun.com pow = map[i] * n;
107610105Sadam.leventhal@sun.com if (pow > 255)
107710105Sadam.leventhal@sun.com pow -= 255;
107810105Sadam.leventhal@sun.com ASSERT(pow <= 255);
107910105Sadam.leventhal@sun.com
108010105Sadam.leventhal@sun.com for (j = 0; j < n; j++) {
108110105Sadam.leventhal@sun.com pow -= map[i];
108210105Sadam.leventhal@sun.com if (pow < 0)
108310105Sadam.leventhal@sun.com pow += 255;
108410105Sadam.leventhal@sun.com rows[i][j] = vdev_raidz_pow2[pow];
108510105Sadam.leventhal@sun.com }
108610105Sadam.leventhal@sun.com }
10872082Seschrock }
10882082Seschrock
108910105Sadam.leventhal@sun.com static void
vdev_raidz_matrix_invert(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** rows,uint8_t ** invrows,const uint8_t * used)109010105Sadam.leventhal@sun.com vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
109110105Sadam.leventhal@sun.com uint8_t **rows, uint8_t **invrows, const uint8_t *used)
109210105Sadam.leventhal@sun.com {
109310105Sadam.leventhal@sun.com int i, j, ii, jj;
109410105Sadam.leventhal@sun.com uint8_t log;
109510105Sadam.leventhal@sun.com
109610105Sadam.leventhal@sun.com /*
109710105Sadam.leventhal@sun.com * Assert that the first nmissing entries from the array of used
109810105Sadam.leventhal@sun.com * columns correspond to parity columns and that subsequent entries
109910105Sadam.leventhal@sun.com * correspond to data columns.
110010105Sadam.leventhal@sun.com */
110110105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) {
110210105Sadam.leventhal@sun.com ASSERT3S(used[i], <, rm->rm_firstdatacol);
110310105Sadam.leventhal@sun.com }
110410105Sadam.leventhal@sun.com for (; i < n; i++) {
110510105Sadam.leventhal@sun.com ASSERT3S(used[i], >=, rm->rm_firstdatacol);
110610105Sadam.leventhal@sun.com }
110710105Sadam.leventhal@sun.com
110810105Sadam.leventhal@sun.com /*
110910105Sadam.leventhal@sun.com * First initialize the storage where we'll compute the inverse rows.
111010105Sadam.leventhal@sun.com */
111110105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) {
111210105Sadam.leventhal@sun.com for (j = 0; j < n; j++) {
111310105Sadam.leventhal@sun.com invrows[i][j] = (i == j) ? 1 : 0;
111410105Sadam.leventhal@sun.com }
111510105Sadam.leventhal@sun.com }
111610105Sadam.leventhal@sun.com
111710105Sadam.leventhal@sun.com /*
111810105Sadam.leventhal@sun.com * Subtract all trivial rows from the rows of consequence.
111910105Sadam.leventhal@sun.com */
112010105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) {
112110105Sadam.leventhal@sun.com for (j = nmissing; j < n; j++) {
112210105Sadam.leventhal@sun.com ASSERT3U(used[j], >=, rm->rm_firstdatacol);
112310105Sadam.leventhal@sun.com jj = used[j] - rm->rm_firstdatacol;
112410105Sadam.leventhal@sun.com ASSERT3S(jj, <, n);
112510105Sadam.leventhal@sun.com invrows[i][j] = rows[i][jj];
112610105Sadam.leventhal@sun.com rows[i][jj] = 0;
112710105Sadam.leventhal@sun.com }
112810105Sadam.leventhal@sun.com }
112910105Sadam.leventhal@sun.com
113010105Sadam.leventhal@sun.com /*
113110105Sadam.leventhal@sun.com * For each of the rows of interest, we must normalize it and subtract
113210105Sadam.leventhal@sun.com * a multiple of it from the other rows.
113310105Sadam.leventhal@sun.com */
113410105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) {
113510105Sadam.leventhal@sun.com for (j = 0; j < missing[i]; j++) {
113610105Sadam.leventhal@sun.com ASSERT3U(rows[i][j], ==, 0);
113710105Sadam.leventhal@sun.com }
113810105Sadam.leventhal@sun.com ASSERT3U(rows[i][missing[i]], !=, 0);
113910105Sadam.leventhal@sun.com
114010105Sadam.leventhal@sun.com /*
114110105Sadam.leventhal@sun.com * Compute the inverse of the first element and multiply each
114210105Sadam.leventhal@sun.com * element in the row by that value.
114310105Sadam.leventhal@sun.com */
114410105Sadam.leventhal@sun.com log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
114510105Sadam.leventhal@sun.com
114610105Sadam.leventhal@sun.com for (j = 0; j < n; j++) {
114710105Sadam.leventhal@sun.com rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
114810105Sadam.leventhal@sun.com invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
114910105Sadam.leventhal@sun.com }
115010105Sadam.leventhal@sun.com
115110105Sadam.leventhal@sun.com for (ii = 0; ii < nmissing; ii++) {
115210105Sadam.leventhal@sun.com if (i == ii)
115310105Sadam.leventhal@sun.com continue;
115410105Sadam.leventhal@sun.com
115510105Sadam.leventhal@sun.com ASSERT3U(rows[ii][missing[i]], !=, 0);
115610105Sadam.leventhal@sun.com
115710105Sadam.leventhal@sun.com log = vdev_raidz_log2[rows[ii][missing[i]]];
115810105Sadam.leventhal@sun.com
115910105Sadam.leventhal@sun.com for (j = 0; j < n; j++) {
116010105Sadam.leventhal@sun.com rows[ii][j] ^=
116110105Sadam.leventhal@sun.com vdev_raidz_exp2(rows[i][j], log);
116210105Sadam.leventhal@sun.com invrows[ii][j] ^=
116310105Sadam.leventhal@sun.com vdev_raidz_exp2(invrows[i][j], log);
116410105Sadam.leventhal@sun.com }
116510105Sadam.leventhal@sun.com }
116610105Sadam.leventhal@sun.com }
116710105Sadam.leventhal@sun.com
116810105Sadam.leventhal@sun.com /*
116910105Sadam.leventhal@sun.com * Verify that the data that is left in the rows are properly part of
117010105Sadam.leventhal@sun.com * an identity matrix.
117110105Sadam.leventhal@sun.com */
117210105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) {
117310105Sadam.leventhal@sun.com for (j = 0; j < n; j++) {
117410105Sadam.leventhal@sun.com if (j == missing[i]) {
117510105Sadam.leventhal@sun.com ASSERT3U(rows[i][j], ==, 1);
117610105Sadam.leventhal@sun.com } else {
117710105Sadam.leventhal@sun.com ASSERT3U(rows[i][j], ==, 0);
117810105Sadam.leventhal@sun.com }
117910105Sadam.leventhal@sun.com }
118010105Sadam.leventhal@sun.com }
118110105Sadam.leventhal@sun.com }
118210105Sadam.leventhal@sun.com
118310105Sadam.leventhal@sun.com static void
vdev_raidz_matrix_reconstruct(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** invrows,const uint8_t * used)118410105Sadam.leventhal@sun.com vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
118510105Sadam.leventhal@sun.com int *missing, uint8_t **invrows, const uint8_t *used)
118610105Sadam.leventhal@sun.com {
118710105Sadam.leventhal@sun.com int i, j, x, cc, c;
118810105Sadam.leventhal@sun.com uint8_t *src;
118910105Sadam.leventhal@sun.com uint64_t ccount;
119010105Sadam.leventhal@sun.com uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
119110105Sadam.leventhal@sun.com uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
119210105Sadam.leventhal@sun.com uint8_t log, val;
119310105Sadam.leventhal@sun.com int ll;
119410105Sadam.leventhal@sun.com uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
119510105Sadam.leventhal@sun.com uint8_t *p, *pp;
119610105Sadam.leventhal@sun.com size_t psize;
119710105Sadam.leventhal@sun.com
119810105Sadam.leventhal@sun.com psize = sizeof (invlog[0][0]) * n * nmissing;
119910105Sadam.leventhal@sun.com p = kmem_alloc(psize, KM_SLEEP);
120010105Sadam.leventhal@sun.com
120110105Sadam.leventhal@sun.com for (pp = p, i = 0; i < nmissing; i++) {
120210105Sadam.leventhal@sun.com invlog[i] = pp;
120310105Sadam.leventhal@sun.com pp += n;
120410105Sadam.leventhal@sun.com }
120510105Sadam.leventhal@sun.com
120610105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) {
120710105Sadam.leventhal@sun.com for (j = 0; j < n; j++) {
120810105Sadam.leventhal@sun.com ASSERT3U(invrows[i][j], !=, 0);
120910105Sadam.leventhal@sun.com invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
121010105Sadam.leventhal@sun.com }
121110105Sadam.leventhal@sun.com }
121210105Sadam.leventhal@sun.com
121310105Sadam.leventhal@sun.com for (i = 0; i < n; i++) {
121410105Sadam.leventhal@sun.com c = used[i];
121510105Sadam.leventhal@sun.com ASSERT3U(c, <, rm->rm_cols);
121610105Sadam.leventhal@sun.com
121710105Sadam.leventhal@sun.com src = rm->rm_col[c].rc_data;
121810105Sadam.leventhal@sun.com ccount = rm->rm_col[c].rc_size;
121910105Sadam.leventhal@sun.com for (j = 0; j < nmissing; j++) {
122010105Sadam.leventhal@sun.com cc = missing[j] + rm->rm_firstdatacol;
122110105Sadam.leventhal@sun.com ASSERT3U(cc, >=, rm->rm_firstdatacol);
122210105Sadam.leventhal@sun.com ASSERT3U(cc, <, rm->rm_cols);
122310105Sadam.leventhal@sun.com ASSERT3U(cc, !=, c);
122410105Sadam.leventhal@sun.com
122510105Sadam.leventhal@sun.com dst[j] = rm->rm_col[cc].rc_data;
122610105Sadam.leventhal@sun.com dcount[j] = rm->rm_col[cc].rc_size;
122710105Sadam.leventhal@sun.com }
122810105Sadam.leventhal@sun.com
122910105Sadam.leventhal@sun.com ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
123010105Sadam.leventhal@sun.com
123110105Sadam.leventhal@sun.com for (x = 0; x < ccount; x++, src++) {
123210105Sadam.leventhal@sun.com if (*src != 0)
123310105Sadam.leventhal@sun.com log = vdev_raidz_log2[*src];
123410105Sadam.leventhal@sun.com
123510105Sadam.leventhal@sun.com for (cc = 0; cc < nmissing; cc++) {
123610105Sadam.leventhal@sun.com if (x >= dcount[cc])
123710105Sadam.leventhal@sun.com continue;
123810105Sadam.leventhal@sun.com
123910105Sadam.leventhal@sun.com if (*src == 0) {
124010105Sadam.leventhal@sun.com val = 0;
124110105Sadam.leventhal@sun.com } else {
124210105Sadam.leventhal@sun.com if ((ll = log + invlog[cc][i]) >= 255)
124310105Sadam.leventhal@sun.com ll -= 255;
124410105Sadam.leventhal@sun.com val = vdev_raidz_pow2[ll];
124510105Sadam.leventhal@sun.com }
124610105Sadam.leventhal@sun.com
124710105Sadam.leventhal@sun.com if (i == 0)
124810105Sadam.leventhal@sun.com dst[cc][x] = val;
124910105Sadam.leventhal@sun.com else
125010105Sadam.leventhal@sun.com dst[cc][x] ^= val;
125110105Sadam.leventhal@sun.com }
125210105Sadam.leventhal@sun.com }
125310105Sadam.leventhal@sun.com }
125410105Sadam.leventhal@sun.com
125510105Sadam.leventhal@sun.com kmem_free(p, psize);
125610105Sadam.leventhal@sun.com }
125710105Sadam.leventhal@sun.com
125810105Sadam.leventhal@sun.com static int
vdev_raidz_reconstruct_general(raidz_map_t * rm,int * tgts,int ntgts)125910105Sadam.leventhal@sun.com vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
126010105Sadam.leventhal@sun.com {
126110105Sadam.leventhal@sun.com int n, i, c, t, tt;
126210105Sadam.leventhal@sun.com int nmissing_rows;
126310105Sadam.leventhal@sun.com int missing_rows[VDEV_RAIDZ_MAXPARITY];
126410105Sadam.leventhal@sun.com int parity_map[VDEV_RAIDZ_MAXPARITY];
126510105Sadam.leventhal@sun.com
126610105Sadam.leventhal@sun.com uint8_t *p, *pp;
126710105Sadam.leventhal@sun.com size_t psize;
126810105Sadam.leventhal@sun.com
126910105Sadam.leventhal@sun.com uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
127010105Sadam.leventhal@sun.com uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
127110105Sadam.leventhal@sun.com uint8_t *used;
127210105Sadam.leventhal@sun.com
127310105Sadam.leventhal@sun.com int code = 0;
127410105Sadam.leventhal@sun.com
127510105Sadam.leventhal@sun.com
127610105Sadam.leventhal@sun.com n = rm->rm_cols - rm->rm_firstdatacol;
127710105Sadam.leventhal@sun.com
127810105Sadam.leventhal@sun.com /*
127910105Sadam.leventhal@sun.com * Figure out which data columns are missing.
128010105Sadam.leventhal@sun.com */
128110105Sadam.leventhal@sun.com nmissing_rows = 0;
128210105Sadam.leventhal@sun.com for (t = 0; t < ntgts; t++) {
128310105Sadam.leventhal@sun.com if (tgts[t] >= rm->rm_firstdatacol) {
128410105Sadam.leventhal@sun.com missing_rows[nmissing_rows++] =
128510105Sadam.leventhal@sun.com tgts[t] - rm->rm_firstdatacol;
128610105Sadam.leventhal@sun.com }
128710105Sadam.leventhal@sun.com }
128810105Sadam.leventhal@sun.com
128910105Sadam.leventhal@sun.com /*
129010105Sadam.leventhal@sun.com * Figure out which parity columns to use to help generate the missing
129110105Sadam.leventhal@sun.com * data columns.
129210105Sadam.leventhal@sun.com */
129310105Sadam.leventhal@sun.com for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
129410105Sadam.leventhal@sun.com ASSERT(tt < ntgts);
129510105Sadam.leventhal@sun.com ASSERT(c < rm->rm_firstdatacol);
129610105Sadam.leventhal@sun.com
129710105Sadam.leventhal@sun.com /*
129810105Sadam.leventhal@sun.com * Skip any targeted parity columns.
129910105Sadam.leventhal@sun.com */
130010105Sadam.leventhal@sun.com if (c == tgts[tt]) {
130110105Sadam.leventhal@sun.com tt++;
130210105Sadam.leventhal@sun.com continue;
130310105Sadam.leventhal@sun.com }
130410105Sadam.leventhal@sun.com
130510105Sadam.leventhal@sun.com code |= 1 << c;
130610105Sadam.leventhal@sun.com
130710105Sadam.leventhal@sun.com parity_map[i] = c;
130810105Sadam.leventhal@sun.com i++;
130910105Sadam.leventhal@sun.com }
131010105Sadam.leventhal@sun.com
131110105Sadam.leventhal@sun.com ASSERT(code != 0);
131210105Sadam.leventhal@sun.com ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
131310105Sadam.leventhal@sun.com
131410105Sadam.leventhal@sun.com psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
131510105Sadam.leventhal@sun.com nmissing_rows * n + sizeof (used[0]) * n;
131610105Sadam.leventhal@sun.com p = kmem_alloc(psize, KM_SLEEP);
131710105Sadam.leventhal@sun.com
131810105Sadam.leventhal@sun.com for (pp = p, i = 0; i < nmissing_rows; i++) {
131910105Sadam.leventhal@sun.com rows[i] = pp;
132010105Sadam.leventhal@sun.com pp += n;
132110105Sadam.leventhal@sun.com invrows[i] = pp;
132210105Sadam.leventhal@sun.com pp += n;
132310105Sadam.leventhal@sun.com }
132410105Sadam.leventhal@sun.com used = pp;
132510105Sadam.leventhal@sun.com
132610105Sadam.leventhal@sun.com for (i = 0; i < nmissing_rows; i++) {
132710105Sadam.leventhal@sun.com used[i] = parity_map[i];
132810105Sadam.leventhal@sun.com }
132910105Sadam.leventhal@sun.com
133010105Sadam.leventhal@sun.com for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
133110105Sadam.leventhal@sun.com if (tt < nmissing_rows &&
133210105Sadam.leventhal@sun.com c == missing_rows[tt] + rm->rm_firstdatacol) {
133310105Sadam.leventhal@sun.com tt++;
133410105Sadam.leventhal@sun.com continue;
133510105Sadam.leventhal@sun.com }
133610105Sadam.leventhal@sun.com
133710105Sadam.leventhal@sun.com ASSERT3S(i, <, n);
133810105Sadam.leventhal@sun.com used[i] = c;
133910105Sadam.leventhal@sun.com i++;
134010105Sadam.leventhal@sun.com }
134110105Sadam.leventhal@sun.com
134210105Sadam.leventhal@sun.com /*
134310105Sadam.leventhal@sun.com * Initialize the interesting rows of the matrix.
134410105Sadam.leventhal@sun.com */
134510105Sadam.leventhal@sun.com vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
134610105Sadam.leventhal@sun.com
134710105Sadam.leventhal@sun.com /*
134810105Sadam.leventhal@sun.com * Invert the matrix.
134910105Sadam.leventhal@sun.com */
135010105Sadam.leventhal@sun.com vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
135110105Sadam.leventhal@sun.com invrows, used);
135210105Sadam.leventhal@sun.com
135310105Sadam.leventhal@sun.com /*
135410105Sadam.leventhal@sun.com * Reconstruct the missing data using the generated matrix.
135510105Sadam.leventhal@sun.com */
135610105Sadam.leventhal@sun.com vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
135710105Sadam.leventhal@sun.com invrows, used);
135810105Sadam.leventhal@sun.com
135910105Sadam.leventhal@sun.com kmem_free(p, psize);
136010105Sadam.leventhal@sun.com
136110105Sadam.leventhal@sun.com return (code);
136210105Sadam.leventhal@sun.com }
136310105Sadam.leventhal@sun.com
136410105Sadam.leventhal@sun.com static int
vdev_raidz_reconstruct(raidz_map_t * rm,int * t,int nt)136510105Sadam.leventhal@sun.com vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
136610105Sadam.leventhal@sun.com {
136710105Sadam.leventhal@sun.com int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
136810105Sadam.leventhal@sun.com int ntgts;
136910105Sadam.leventhal@sun.com int i, c;
137010105Sadam.leventhal@sun.com int code;
137110105Sadam.leventhal@sun.com int nbadparity, nbaddata;
137210105Sadam.leventhal@sun.com int parity_valid[VDEV_RAIDZ_MAXPARITY];
137310105Sadam.leventhal@sun.com
137410105Sadam.leventhal@sun.com /*
137510105Sadam.leventhal@sun.com * The tgts list must already be sorted.
137610105Sadam.leventhal@sun.com */
137710105Sadam.leventhal@sun.com for (i = 1; i < nt; i++) {
137810105Sadam.leventhal@sun.com ASSERT(t[i] > t[i - 1]);
137910105Sadam.leventhal@sun.com }
138010105Sadam.leventhal@sun.com
138110105Sadam.leventhal@sun.com nbadparity = rm->rm_firstdatacol;
138210105Sadam.leventhal@sun.com nbaddata = rm->rm_cols - nbadparity;
138310105Sadam.leventhal@sun.com ntgts = 0;
138410105Sadam.leventhal@sun.com for (i = 0, c = 0; c < rm->rm_cols; c++) {
138510105Sadam.leventhal@sun.com if (c < rm->rm_firstdatacol)
138610105Sadam.leventhal@sun.com parity_valid[c] = B_FALSE;
138710105Sadam.leventhal@sun.com
138810105Sadam.leventhal@sun.com if (i < nt && c == t[i]) {
138910105Sadam.leventhal@sun.com tgts[ntgts++] = c;
139010105Sadam.leventhal@sun.com i++;
139110105Sadam.leventhal@sun.com } else if (rm->rm_col[c].rc_error != 0) {
139210105Sadam.leventhal@sun.com tgts[ntgts++] = c;
139310105Sadam.leventhal@sun.com } else if (c >= rm->rm_firstdatacol) {
139410105Sadam.leventhal@sun.com nbaddata--;
139510105Sadam.leventhal@sun.com } else {
139610105Sadam.leventhal@sun.com parity_valid[c] = B_TRUE;
139710105Sadam.leventhal@sun.com nbadparity--;
139810105Sadam.leventhal@sun.com }
139910105Sadam.leventhal@sun.com }
140010105Sadam.leventhal@sun.com
140110105Sadam.leventhal@sun.com ASSERT(ntgts >= nt);
140210105Sadam.leventhal@sun.com ASSERT(nbaddata >= 0);
140310105Sadam.leventhal@sun.com ASSERT(nbaddata + nbadparity == ntgts);
140410105Sadam.leventhal@sun.com
140510105Sadam.leventhal@sun.com dt = &tgts[nbadparity];
140610105Sadam.leventhal@sun.com
140710105Sadam.leventhal@sun.com /*
140810105Sadam.leventhal@sun.com * See if we can use any of our optimized reconstruction routines.
140910105Sadam.leventhal@sun.com */
141010105Sadam.leventhal@sun.com if (!vdev_raidz_default_to_general) {
141110105Sadam.leventhal@sun.com switch (nbaddata) {
141210105Sadam.leventhal@sun.com case 1:
141310105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_P])
141410105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_p(rm, dt, 1));
141510105Sadam.leventhal@sun.com
141610105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 1);
141710105Sadam.leventhal@sun.com
141810105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_Q])
141910105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_q(rm, dt, 1));
142010105Sadam.leventhal@sun.com
142110105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 2);
142210105Sadam.leventhal@sun.com break;
142310105Sadam.leventhal@sun.com
142410105Sadam.leventhal@sun.com case 2:
142510105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 1);
142610105Sadam.leventhal@sun.com
142710105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_P] &&
142810105Sadam.leventhal@sun.com parity_valid[VDEV_RAIDZ_Q])
142910105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_pq(rm, dt, 2));
143010105Sadam.leventhal@sun.com
143110105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 2);
143210105Sadam.leventhal@sun.com
143310105Sadam.leventhal@sun.com break;
143410105Sadam.leventhal@sun.com }
143510105Sadam.leventhal@sun.com }
143610105Sadam.leventhal@sun.com
143710105Sadam.leventhal@sun.com code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
143810105Sadam.leventhal@sun.com ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
143910105Sadam.leventhal@sun.com ASSERT(code > 0);
144010105Sadam.leventhal@sun.com return (code);
144110105Sadam.leventhal@sun.com }
14422082Seschrock
1443789Sahrens static int
vdev_raidz_open(vdev_t * vd,uint64_t * asize,uint64_t * ashift)1444789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
1445789Sahrens {
144610105Sadam.leventhal@sun.com vdev_t *cvd;
14472082Seschrock uint64_t nparity = vd->vdev_nparity;
144810105Sadam.leventhal@sun.com int c;
1449789Sahrens int lasterror = 0;
1450789Sahrens int numerrors = 0;
1451789Sahrens
14522082Seschrock ASSERT(nparity > 0);
14532082Seschrock
14542082Seschrock if (nparity > VDEV_RAIDZ_MAXPARITY ||
14552082Seschrock vd->vdev_children < nparity + 1) {
1456789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1457789Sahrens return (EINVAL);
1458789Sahrens }
1459789Sahrens
14609846SEric.Taylor@Sun.COM vdev_open_children(vd);
1461789Sahrens
146210105Sadam.leventhal@sun.com for (c = 0; c < vd->vdev_children; c++) {
146310105Sadam.leventhal@sun.com cvd = vd->vdev_child[c];
14649846SEric.Taylor@Sun.COM
146510105Sadam.leventhal@sun.com if (cvd->vdev_open_error != 0) {
14669846SEric.Taylor@Sun.COM lasterror = cvd->vdev_open_error;
1467789Sahrens numerrors++;
1468789Sahrens continue;
1469789Sahrens }
1470789Sahrens
1471789Sahrens *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
14721732Sbonwick *ashift = MAX(*ashift, cvd->vdev_ashift);
1473789Sahrens }
1474789Sahrens
1475789Sahrens *asize *= vd->vdev_children;
1476789Sahrens
14772082Seschrock if (numerrors > nparity) {
1478789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1479789Sahrens return (lasterror);
1480789Sahrens }
1481789Sahrens
1482789Sahrens return (0);
1483789Sahrens }
1484789Sahrens
1485789Sahrens static void
vdev_raidz_close(vdev_t * vd)1486789Sahrens vdev_raidz_close(vdev_t *vd)
1487789Sahrens {
148810105Sadam.leventhal@sun.com int c;
148910105Sadam.leventhal@sun.com
149010105Sadam.leventhal@sun.com for (c = 0; c < vd->vdev_children; c++)
1491789Sahrens vdev_close(vd->vdev_child[c]);
1492789Sahrens }
1493789Sahrens
1494789Sahrens static uint64_t
vdev_raidz_asize(vdev_t * vd,uint64_t psize)1495789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1496789Sahrens {
1497789Sahrens uint64_t asize;
14981732Sbonwick uint64_t ashift = vd->vdev_top->vdev_ashift;
1499789Sahrens uint64_t cols = vd->vdev_children;
15002082Seschrock uint64_t nparity = vd->vdev_nparity;
1501789Sahrens
15021732Sbonwick asize = ((psize - 1) >> ashift) + 1;
15032082Seschrock asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
15042082Seschrock asize = roundup(asize, nparity + 1) << ashift;
1505789Sahrens
1506789Sahrens return (asize);
1507789Sahrens }
1508789Sahrens
1509789Sahrens static void
vdev_raidz_child_done(zio_t * zio)1510789Sahrens vdev_raidz_child_done(zio_t *zio)
1511789Sahrens {
1512789Sahrens raidz_col_t *rc = zio->io_private;
1513789Sahrens
1514789Sahrens rc->rc_error = zio->io_error;
1515789Sahrens rc->rc_tried = 1;
1516789Sahrens rc->rc_skipped = 0;
1517789Sahrens }
1518789Sahrens
15195530Sbonwick static int
vdev_raidz_io_start(zio_t * zio)1520789Sahrens vdev_raidz_io_start(zio_t *zio)
1521789Sahrens {
1522789Sahrens vdev_t *vd = zio->io_vd;
15231732Sbonwick vdev_t *tvd = vd->vdev_top;
1524789Sahrens vdev_t *cvd;
1525789Sahrens raidz_map_t *rm;
1526789Sahrens raidz_col_t *rc;
152710105Sadam.leventhal@sun.com int c, i;
1528789Sahrens
15292082Seschrock rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
15302082Seschrock vd->vdev_nparity);
1531789Sahrens
15321775Sbillm ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1533789Sahrens
1534789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) {
153510105Sadam.leventhal@sun.com vdev_raidz_generate_parity(rm);
1536789Sahrens
1537789Sahrens for (c = 0; c < rm->rm_cols; c++) {
1538789Sahrens rc = &rm->rm_col[c];
15392082Seschrock cvd = vd->vdev_child[rc->rc_devidx];
1540789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1541789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size,
15427754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0,
1543789Sahrens vdev_raidz_child_done, rc));
1544789Sahrens }
15455530Sbonwick
154610105Sadam.leventhal@sun.com /*
154710105Sadam.leventhal@sun.com * Generate optional I/Os for any skipped sectors to improve
154810105Sadam.leventhal@sun.com * aggregation contiguity.
154910105Sadam.leventhal@sun.com */
155010450Sadam.leventhal@sun.com for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
155110105Sadam.leventhal@sun.com ASSERT(c <= rm->rm_scols);
155210105Sadam.leventhal@sun.com if (c == rm->rm_scols)
155310105Sadam.leventhal@sun.com c = 0;
155410105Sadam.leventhal@sun.com rc = &rm->rm_col[c];
155510105Sadam.leventhal@sun.com cvd = vd->vdev_child[rc->rc_devidx];
155610105Sadam.leventhal@sun.com zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
155710105Sadam.leventhal@sun.com rc->rc_offset + rc->rc_size, NULL,
155810105Sadam.leventhal@sun.com 1 << tvd->vdev_ashift,
155910105Sadam.leventhal@sun.com zio->io_type, zio->io_priority,
156010105Sadam.leventhal@sun.com ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
156110105Sadam.leventhal@sun.com }
156210105Sadam.leventhal@sun.com
15637754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE);
1564789Sahrens }
1565789Sahrens
1566789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ);
1567789Sahrens
15682082Seschrock /*
15692082Seschrock * Iterate over the columns in reverse order so that we hit the parity
157010105Sadam.leventhal@sun.com * last -- any errors along the way will force us to read the parity.
15712082Seschrock */
1572789Sahrens for (c = rm->rm_cols - 1; c >= 0; c--) {
1573789Sahrens rc = &rm->rm_col[c];
15742082Seschrock cvd = vd->vdev_child[rc->rc_devidx];
15755329Sgw25295 if (!vdev_readable(cvd)) {
15762082Seschrock if (c >= rm->rm_firstdatacol)
15772082Seschrock rm->rm_missingdata++;
15782082Seschrock else
15792082Seschrock rm->rm_missingparity++;
1580789Sahrens rc->rc_error = ENXIO;
1581789Sahrens rc->rc_tried = 1; /* don't even try */
1582789Sahrens rc->rc_skipped = 1;
1583789Sahrens continue;
1584789Sahrens }
158510922SJeff.Bonwick@Sun.COM if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
15862082Seschrock if (c >= rm->rm_firstdatacol)
15872082Seschrock rm->rm_missingdata++;
15882082Seschrock else
15892082Seschrock rm->rm_missingparity++;
1590789Sahrens rc->rc_error = ESTALE;
1591789Sahrens rc->rc_skipped = 1;
1592789Sahrens continue;
1593789Sahrens }
15942082Seschrock if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
15959434SMark.Musante@Sun.COM (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1596789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1597789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size,
15987754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0,
1599789Sahrens vdev_raidz_child_done, rc));
1600789Sahrens }
1601789Sahrens }
1602789Sahrens
16037754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE);
1604789Sahrens }
1605789Sahrens
1606*12296SLin.Ling@Sun.COM
16071544Seschrock /*
16081544Seschrock * Report a checksum error for a child of a RAID-Z device.
16091544Seschrock */
16101544Seschrock static void
raidz_checksum_error(zio_t * zio,raidz_col_t * rc,void * bad_data)161110614SJonathan.Adams@Sun.COM raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
16121544Seschrock {
16132082Seschrock vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
16141544Seschrock
16151544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
161610614SJonathan.Adams@Sun.COM zio_bad_cksum_t zbc;
161710614SJonathan.Adams@Sun.COM raidz_map_t *rm = zio->io_vsd;
161810614SJonathan.Adams@Sun.COM
16191544Seschrock mutex_enter(&vd->vdev_stat_lock);
16201544Seschrock vd->vdev_stat.vs_checksum_errors++;
16211544Seschrock mutex_exit(&vd->vdev_stat_lock);
162210614SJonathan.Adams@Sun.COM
162310614SJonathan.Adams@Sun.COM zbc.zbc_has_cksum = 0;
162410614SJonathan.Adams@Sun.COM zbc.zbc_injected = rm->rm_ecksuminjected;
162510614SJonathan.Adams@Sun.COM
162610614SJonathan.Adams@Sun.COM zfs_ereport_post_checksum(zio->io_spa, vd, zio,
162710614SJonathan.Adams@Sun.COM rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
162810614SJonathan.Adams@Sun.COM &zbc);
16291544Seschrock }
163010614SJonathan.Adams@Sun.COM }
16311544Seschrock
163210614SJonathan.Adams@Sun.COM /*
163310614SJonathan.Adams@Sun.COM * We keep track of whether or not there were any injected errors, so that
163410614SJonathan.Adams@Sun.COM * any ereports we generate can note it.
163510614SJonathan.Adams@Sun.COM */
163610614SJonathan.Adams@Sun.COM static int
raidz_checksum_verify(zio_t * zio)163710614SJonathan.Adams@Sun.COM raidz_checksum_verify(zio_t *zio)
163810614SJonathan.Adams@Sun.COM {
163910614SJonathan.Adams@Sun.COM zio_bad_cksum_t zbc;
164010614SJonathan.Adams@Sun.COM raidz_map_t *rm = zio->io_vsd;
164110614SJonathan.Adams@Sun.COM
164210614SJonathan.Adams@Sun.COM int ret = zio_checksum_error(zio, &zbc);
164310614SJonathan.Adams@Sun.COM if (ret != 0 && zbc.zbc_injected != 0)
164410614SJonathan.Adams@Sun.COM rm->rm_ecksuminjected = 1;
164510614SJonathan.Adams@Sun.COM
164610614SJonathan.Adams@Sun.COM return (ret);
16471544Seschrock }
16481544Seschrock
16492082Seschrock /*
16502082Seschrock * Generate the parity from the data columns. If we tried and were able to
16512082Seschrock * read the parity without error, verify that the generated parity matches the
16522082Seschrock * data we read. If it doesn't, we fire off a checksum error. Return the
16532082Seschrock * number such failures.
16542082Seschrock */
16552082Seschrock static int
raidz_parity_verify(zio_t * zio,raidz_map_t * rm)16562082Seschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
16572082Seschrock {
16582082Seschrock void *orig[VDEV_RAIDZ_MAXPARITY];
16592082Seschrock int c, ret = 0;
16602082Seschrock raidz_col_t *rc;
16612082Seschrock
16622082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) {
16632082Seschrock rc = &rm->rm_col[c];
16642082Seschrock if (!rc->rc_tried || rc->rc_error != 0)
16652082Seschrock continue;
16662082Seschrock orig[c] = zio_buf_alloc(rc->rc_size);
16672082Seschrock bcopy(rc->rc_data, orig[c], rc->rc_size);
16682082Seschrock }
16692082Seschrock
167010105Sadam.leventhal@sun.com vdev_raidz_generate_parity(rm);
16712082Seschrock
16722082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) {
16732082Seschrock rc = &rm->rm_col[c];
16742082Seschrock if (!rc->rc_tried || rc->rc_error != 0)
16752082Seschrock continue;
16762082Seschrock if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
167710614SJonathan.Adams@Sun.COM raidz_checksum_error(zio, rc, orig[c]);
16782082Seschrock rc->rc_error = ECKSUM;
16792082Seschrock ret++;
16802082Seschrock }
16812082Seschrock zio_buf_free(orig[c], rc->rc_size);
16822082Seschrock }
16832082Seschrock
16842082Seschrock return (ret);
16852082Seschrock }
16862082Seschrock
168710105Sadam.leventhal@sun.com /*
168810105Sadam.leventhal@sun.com * Keep statistics on all the ways that we used parity to correct data.
168910105Sadam.leventhal@sun.com */
169010105Sadam.leventhal@sun.com static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
16911544Seschrock
16925530Sbonwick static int
vdev_raidz_worst_error(raidz_map_t * rm)16937754SJeff.Bonwick@Sun.COM vdev_raidz_worst_error(raidz_map_t *rm)
16947754SJeff.Bonwick@Sun.COM {
16957754SJeff.Bonwick@Sun.COM int error = 0;
16967754SJeff.Bonwick@Sun.COM
16977754SJeff.Bonwick@Sun.COM for (int c = 0; c < rm->rm_cols; c++)
16987754SJeff.Bonwick@Sun.COM error = zio_worst_error(error, rm->rm_col[c].rc_error);
16997754SJeff.Bonwick@Sun.COM
17007754SJeff.Bonwick@Sun.COM return (error);
17017754SJeff.Bonwick@Sun.COM }
17027754SJeff.Bonwick@Sun.COM
170310105Sadam.leventhal@sun.com /*
170410105Sadam.leventhal@sun.com * Iterate over all combinations of bad data and attempt a reconstruction.
170510105Sadam.leventhal@sun.com * Note that the algorithm below is non-optimal because it doesn't take into
170610105Sadam.leventhal@sun.com * account how reconstruction is actually performed. For example, with
170710105Sadam.leventhal@sun.com * triple-parity RAID-Z the reconstruction procedure is the same if column 4
170810105Sadam.leventhal@sun.com * is targeted as invalid as if columns 1 and 4 are targeted since in both
170910105Sadam.leventhal@sun.com * cases we'd only use parity information in column 0.
171010105Sadam.leventhal@sun.com */
171110105Sadam.leventhal@sun.com static int
vdev_raidz_combrec(zio_t * zio,int total_errors,int data_errors)171210105Sadam.leventhal@sun.com vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
171310105Sadam.leventhal@sun.com {
171410105Sadam.leventhal@sun.com raidz_map_t *rm = zio->io_vsd;
171510105Sadam.leventhal@sun.com raidz_col_t *rc;
171610105Sadam.leventhal@sun.com void *orig[VDEV_RAIDZ_MAXPARITY];
171710105Sadam.leventhal@sun.com int tstore[VDEV_RAIDZ_MAXPARITY + 2];
171810105Sadam.leventhal@sun.com int *tgts = &tstore[1];
171910105Sadam.leventhal@sun.com int current, next, i, c, n;
172010105Sadam.leventhal@sun.com int code, ret = 0;
172110105Sadam.leventhal@sun.com
172210105Sadam.leventhal@sun.com ASSERT(total_errors < rm->rm_firstdatacol);
172310105Sadam.leventhal@sun.com
172410105Sadam.leventhal@sun.com /*
172510105Sadam.leventhal@sun.com * This simplifies one edge condition.
172610105Sadam.leventhal@sun.com */
172710105Sadam.leventhal@sun.com tgts[-1] = -1;
172810105Sadam.leventhal@sun.com
172910105Sadam.leventhal@sun.com for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
173010105Sadam.leventhal@sun.com /*
173110105Sadam.leventhal@sun.com * Initialize the targets array by finding the first n columns
173210105Sadam.leventhal@sun.com * that contain no error.
173310105Sadam.leventhal@sun.com *
173410105Sadam.leventhal@sun.com * If there were no data errors, we need to ensure that we're
173510105Sadam.leventhal@sun.com * always explicitly attempting to reconstruct at least one
173610105Sadam.leventhal@sun.com * data column. To do this, we simply push the highest target
173710105Sadam.leventhal@sun.com * up into the data columns.
173810105Sadam.leventhal@sun.com */
173910105Sadam.leventhal@sun.com for (c = 0, i = 0; i < n; i++) {
174010105Sadam.leventhal@sun.com if (i == n - 1 && data_errors == 0 &&
174110105Sadam.leventhal@sun.com c < rm->rm_firstdatacol) {
174210105Sadam.leventhal@sun.com c = rm->rm_firstdatacol;
174310105Sadam.leventhal@sun.com }
174410105Sadam.leventhal@sun.com
174510105Sadam.leventhal@sun.com while (rm->rm_col[c].rc_error != 0) {
174610105Sadam.leventhal@sun.com c++;
174710105Sadam.leventhal@sun.com ASSERT3S(c, <, rm->rm_cols);
174810105Sadam.leventhal@sun.com }
174910105Sadam.leventhal@sun.com
175010105Sadam.leventhal@sun.com tgts[i] = c++;
175110105Sadam.leventhal@sun.com }
175210105Sadam.leventhal@sun.com
175310105Sadam.leventhal@sun.com /*
175410105Sadam.leventhal@sun.com * Setting tgts[n] simplifies the other edge condition.
175510105Sadam.leventhal@sun.com */
175610105Sadam.leventhal@sun.com tgts[n] = rm->rm_cols;
175710105Sadam.leventhal@sun.com
175810105Sadam.leventhal@sun.com /*
175910105Sadam.leventhal@sun.com * These buffers were allocated in previous iterations.
176010105Sadam.leventhal@sun.com */
176110105Sadam.leventhal@sun.com for (i = 0; i < n - 1; i++) {
176210105Sadam.leventhal@sun.com ASSERT(orig[i] != NULL);
176310105Sadam.leventhal@sun.com }
176410105Sadam.leventhal@sun.com
176510105Sadam.leventhal@sun.com orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
176610105Sadam.leventhal@sun.com
176710105Sadam.leventhal@sun.com current = 0;
176810105Sadam.leventhal@sun.com next = tgts[current];
176910105Sadam.leventhal@sun.com
177010105Sadam.leventhal@sun.com while (current != n) {
177110105Sadam.leventhal@sun.com tgts[current] = next;
177210105Sadam.leventhal@sun.com current = 0;
177310105Sadam.leventhal@sun.com
177410105Sadam.leventhal@sun.com /*
177510105Sadam.leventhal@sun.com * Save off the original data that we're going to
177610105Sadam.leventhal@sun.com * attempt to reconstruct.
177710105Sadam.leventhal@sun.com */
177810105Sadam.leventhal@sun.com for (i = 0; i < n; i++) {
177910105Sadam.leventhal@sun.com ASSERT(orig[i] != NULL);
178010105Sadam.leventhal@sun.com c = tgts[i];
178110105Sadam.leventhal@sun.com ASSERT3S(c, >=, 0);
178210105Sadam.leventhal@sun.com ASSERT3S(c, <, rm->rm_cols);
178310105Sadam.leventhal@sun.com rc = &rm->rm_col[c];
178410105Sadam.leventhal@sun.com bcopy(rc->rc_data, orig[i], rc->rc_size);
178510105Sadam.leventhal@sun.com }
178610105Sadam.leventhal@sun.com
178710105Sadam.leventhal@sun.com /*
178810105Sadam.leventhal@sun.com * Attempt a reconstruction and exit the outer loop on
178910105Sadam.leventhal@sun.com * success.
179010105Sadam.leventhal@sun.com */
179110105Sadam.leventhal@sun.com code = vdev_raidz_reconstruct(rm, tgts, n);
179210614SJonathan.Adams@Sun.COM if (raidz_checksum_verify(zio) == 0) {
179310105Sadam.leventhal@sun.com atomic_inc_64(&raidz_corrected[code]);
179410105Sadam.leventhal@sun.com
179510105Sadam.leventhal@sun.com for (i = 0; i < n; i++) {
179610105Sadam.leventhal@sun.com c = tgts[i];
179710105Sadam.leventhal@sun.com rc = &rm->rm_col[c];
179810105Sadam.leventhal@sun.com ASSERT(rc->rc_error == 0);
179910614SJonathan.Adams@Sun.COM if (rc->rc_tried)
180010614SJonathan.Adams@Sun.COM raidz_checksum_error(zio, rc,
180110614SJonathan.Adams@Sun.COM orig[i]);
180210105Sadam.leventhal@sun.com rc->rc_error = ECKSUM;
180310105Sadam.leventhal@sun.com }
180410105Sadam.leventhal@sun.com
180510105Sadam.leventhal@sun.com ret = code;
180610105Sadam.leventhal@sun.com goto done;
180710105Sadam.leventhal@sun.com }
180810105Sadam.leventhal@sun.com
180910105Sadam.leventhal@sun.com /*
181010105Sadam.leventhal@sun.com * Restore the original data.
181110105Sadam.leventhal@sun.com */
181210105Sadam.leventhal@sun.com for (i = 0; i < n; i++) {
181310105Sadam.leventhal@sun.com c = tgts[i];
181410105Sadam.leventhal@sun.com rc = &rm->rm_col[c];
181510105Sadam.leventhal@sun.com bcopy(orig[i], rc->rc_data, rc->rc_size);
181610105Sadam.leventhal@sun.com }
181710105Sadam.leventhal@sun.com
181810105Sadam.leventhal@sun.com do {
181910105Sadam.leventhal@sun.com /*
182010105Sadam.leventhal@sun.com * Find the next valid column after the current
182110105Sadam.leventhal@sun.com * position..
182210105Sadam.leventhal@sun.com */
182310105Sadam.leventhal@sun.com for (next = tgts[current] + 1;
182410105Sadam.leventhal@sun.com next < rm->rm_cols &&
182510105Sadam.leventhal@sun.com rm->rm_col[next].rc_error != 0; next++)
182610105Sadam.leventhal@sun.com continue;
182710105Sadam.leventhal@sun.com
182810105Sadam.leventhal@sun.com ASSERT(next <= tgts[current + 1]);
182910105Sadam.leventhal@sun.com
183010105Sadam.leventhal@sun.com /*
183110105Sadam.leventhal@sun.com * If that spot is available, we're done here.
183210105Sadam.leventhal@sun.com */
183310105Sadam.leventhal@sun.com if (next != tgts[current + 1])
183410105Sadam.leventhal@sun.com break;
183510105Sadam.leventhal@sun.com
183610105Sadam.leventhal@sun.com /*
183710105Sadam.leventhal@sun.com * Otherwise, find the next valid column after
183810105Sadam.leventhal@sun.com * the previous position.
183910105Sadam.leventhal@sun.com */
184010105Sadam.leventhal@sun.com for (c = tgts[current - 1] + 1;
184110105Sadam.leventhal@sun.com rm->rm_col[c].rc_error != 0; c++)
184210105Sadam.leventhal@sun.com continue;
184310105Sadam.leventhal@sun.com
184410105Sadam.leventhal@sun.com tgts[current] = c;
184510105Sadam.leventhal@sun.com current++;
184610105Sadam.leventhal@sun.com
184710105Sadam.leventhal@sun.com } while (current != n);
184810105Sadam.leventhal@sun.com }
184910105Sadam.leventhal@sun.com }
185010105Sadam.leventhal@sun.com n--;
185110105Sadam.leventhal@sun.com done:
185210105Sadam.leventhal@sun.com for (i = 0; i < n; i++) {
185310105Sadam.leventhal@sun.com zio_buf_free(orig[i], rm->rm_col[0].rc_size);
185410105Sadam.leventhal@sun.com }
185510105Sadam.leventhal@sun.com
185610105Sadam.leventhal@sun.com return (ret);
185710105Sadam.leventhal@sun.com }
185810105Sadam.leventhal@sun.com
18597754SJeff.Bonwick@Sun.COM static void
vdev_raidz_io_done(zio_t * zio)1860789Sahrens vdev_raidz_io_done(zio_t *zio)
1861789Sahrens {
1862789Sahrens vdev_t *vd = zio->io_vd;
1863789Sahrens vdev_t *cvd;
1864789Sahrens raidz_map_t *rm = zio->io_vsd;
186510105Sadam.leventhal@sun.com raidz_col_t *rc;
1866789Sahrens int unexpected_errors = 0;
18672082Seschrock int parity_errors = 0;
18683456Sahl int parity_untried = 0;
18692082Seschrock int data_errors = 0;
18707754SJeff.Bonwick@Sun.COM int total_errors = 0;
187110105Sadam.leventhal@sun.com int n, c;
187210105Sadam.leventhal@sun.com int tgts[VDEV_RAIDZ_MAXPARITY];
187310105Sadam.leventhal@sun.com int code;
1874789Sahrens
18751775Sbillm ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
1876789Sahrens
18772082Seschrock ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
18782082Seschrock ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
18792082Seschrock
1880789Sahrens for (c = 0; c < rm->rm_cols; c++) {
1881789Sahrens rc = &rm->rm_col[c];
1882789Sahrens
1883789Sahrens if (rc->rc_error) {
18847754SJeff.Bonwick@Sun.COM ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
18852082Seschrock
18862082Seschrock if (c < rm->rm_firstdatacol)
18872082Seschrock parity_errors++;
18882082Seschrock else
18892082Seschrock data_errors++;
18902082Seschrock
1891789Sahrens if (!rc->rc_skipped)
1892789Sahrens unexpected_errors++;
18932082Seschrock
18947754SJeff.Bonwick@Sun.COM total_errors++;
18953456Sahl } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
18963456Sahl parity_untried++;
1897789Sahrens }
1898789Sahrens }
1899789Sahrens
1900789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) {
1901789Sahrens /*
19027754SJeff.Bonwick@Sun.COM * XXX -- for now, treat partial writes as a success.
19037754SJeff.Bonwick@Sun.COM * (If we couldn't write enough columns to reconstruct
19047754SJeff.Bonwick@Sun.COM * the data, the I/O failed. Otherwise, good enough.)
19057754SJeff.Bonwick@Sun.COM *
19067754SJeff.Bonwick@Sun.COM * Now that we support write reallocation, it would be better
19077754SJeff.Bonwick@Sun.COM * to treat partial failure as real failure unless there are
19087754SJeff.Bonwick@Sun.COM * no non-degraded top-level vdevs left, and not update DTLs
19097754SJeff.Bonwick@Sun.COM * if we intend to reallocate.
1910789Sahrens */
1911789Sahrens /* XXPOLICY */
19127754SJeff.Bonwick@Sun.COM if (total_errors > rm->rm_firstdatacol)
19137754SJeff.Bonwick@Sun.COM zio->io_error = vdev_raidz_worst_error(rm);
1914789Sahrens
19157754SJeff.Bonwick@Sun.COM return;
1916789Sahrens }
1917789Sahrens
1918789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ);
19192082Seschrock /*
19202082Seschrock * There are three potential phases for a read:
19212082Seschrock * 1. produce valid data from the columns read
19222082Seschrock * 2. read all disks and try again
19232082Seschrock * 3. perform combinatorial reconstruction
19242082Seschrock *
19252082Seschrock * Each phase is progressively both more expensive and less likely to
19262082Seschrock * occur. If we encounter more errors than we can repair or all phases
19272082Seschrock * fail, we have no choice but to return an error.
19282082Seschrock */
1929789Sahrens
1930789Sahrens /*
19312082Seschrock * If the number of errors we saw was correctable -- less than or equal
19323456Sahl * to the number of parity disks read -- attempt to produce data that
19333456Sahl * has a valid checksum. Naturally, this case applies in the absence of
19343456Sahl * any errors.
1935789Sahrens */
19367754SJeff.Bonwick@Sun.COM if (total_errors <= rm->rm_firstdatacol - parity_untried) {
193710105Sadam.leventhal@sun.com if (data_errors == 0) {
193810614SJonathan.Adams@Sun.COM if (raidz_checksum_verify(zio) == 0) {
19394034Sahl /*
19404034Sahl * If we read parity information (unnecessarily
19414034Sahl * as it happens since no reconstruction was
19424034Sahl * needed) regenerate and verify the parity.
19434034Sahl * We also regenerate parity when resilvering
19444034Sahl * so we can write it out to the failed device
19454034Sahl * later.
19464034Sahl */
19473456Sahl if (parity_errors + parity_untried <
19484034Sahl rm->rm_firstdatacol ||
19494034Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) {
19503456Sahl n = raidz_parity_verify(zio, rm);
19513456Sahl unexpected_errors += n;
19523456Sahl ASSERT(parity_errors + n <=
19533456Sahl rm->rm_firstdatacol);
19543456Sahl }
19552082Seschrock goto done;
19562082Seschrock }
195710105Sadam.leventhal@sun.com } else {
19583456Sahl /*
19593456Sahl * We either attempt to read all the parity columns or
19603456Sahl * none of them. If we didn't try to read parity, we
19613456Sahl * wouldn't be here in the correctable case. There must
19623456Sahl * also have been fewer parity errors than parity
19633456Sahl * columns or, again, we wouldn't be in this code path.
19643456Sahl */
19653456Sahl ASSERT(parity_untried == 0);
19662082Seschrock ASSERT(parity_errors < rm->rm_firstdatacol);
19672082Seschrock
19682082Seschrock /*
196910105Sadam.leventhal@sun.com * Identify the data columns that reported an error.
19702082Seschrock */
197110105Sadam.leventhal@sun.com n = 0;
19722082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
19732082Seschrock rc = &rm->rm_col[c];
197410105Sadam.leventhal@sun.com if (rc->rc_error != 0) {
197510105Sadam.leventhal@sun.com ASSERT(n < VDEV_RAIDZ_MAXPARITY);
197610105Sadam.leventhal@sun.com tgts[n++] = c;
197710105Sadam.leventhal@sun.com }
19782082Seschrock }
19792082Seschrock
198010105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol >= n);
198110105Sadam.leventhal@sun.com
198210105Sadam.leventhal@sun.com code = vdev_raidz_reconstruct(rm, tgts, n);
19832082Seschrock
198410614SJonathan.Adams@Sun.COM if (raidz_checksum_verify(zio) == 0) {
198510105Sadam.leventhal@sun.com atomic_inc_64(&raidz_corrected[code]);
1986789Sahrens
19872082Seschrock /*
198810105Sadam.leventhal@sun.com * If we read more parity disks than were used
198910105Sadam.leventhal@sun.com * for reconstruction, confirm that the other
199010105Sadam.leventhal@sun.com * parity disks produced correct data. This
199110105Sadam.leventhal@sun.com * routine is suboptimal in that it regenerates
199210105Sadam.leventhal@sun.com * the parity that we already used in addition
199310105Sadam.leventhal@sun.com * to the parity that we're attempting to
199410105Sadam.leventhal@sun.com * verify, but this should be a relatively
199510105Sadam.leventhal@sun.com * uncommon case, and can be optimized if it
199610105Sadam.leventhal@sun.com * becomes a problem. Note that we regenerate
199710105Sadam.leventhal@sun.com * parity when resilvering so we can write it
199810105Sadam.leventhal@sun.com * out to failed devices later.
19992082Seschrock */
200010105Sadam.leventhal@sun.com if (parity_errors < rm->rm_firstdatacol - n ||
20014034Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) {
20022082Seschrock n = raidz_parity_verify(zio, rm);
20032082Seschrock unexpected_errors += n;
20042082Seschrock ASSERT(parity_errors + n <=
20052082Seschrock rm->rm_firstdatacol);
20062082Seschrock }
20072082Seschrock
20082082Seschrock goto done;
20092082Seschrock }
2010789Sahrens }
2011789Sahrens }
2012789Sahrens
2013789Sahrens /*
20142082Seschrock * This isn't a typical situation -- either we got a read error or
20152082Seschrock * a child silently returned bad data. Read every block so we can
20162082Seschrock * try again with as much data and parity as we can track down. If
20172082Seschrock * we've already been through once before, all children will be marked
20182082Seschrock * as tried so we'll proceed to combinatorial reconstruction.
2019789Sahrens */
2020789Sahrens unexpected_errors = 1;
20212082Seschrock rm->rm_missingdata = 0;
20222082Seschrock rm->rm_missingparity = 0;
2023789Sahrens
20242082Seschrock for (c = 0; c < rm->rm_cols; c++) {
20252082Seschrock if (rm->rm_col[c].rc_tried)
20262082Seschrock continue;
2027789Sahrens
2028789Sahrens zio_vdev_io_redone(zio);
20292082Seschrock do {
2030789Sahrens rc = &rm->rm_col[c];
2031789Sahrens if (rc->rc_tried)
2032789Sahrens continue;
2033789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL,
20342082Seschrock vd->vdev_child[rc->rc_devidx],
2035789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size,
20367754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0,
2037789Sahrens vdev_raidz_child_done, rc));
20382082Seschrock } while (++c < rm->rm_cols);
20395530Sbonwick
20407754SJeff.Bonwick@Sun.COM return;
2041789Sahrens }
2042789Sahrens
2043789Sahrens /*
20442082Seschrock * At this point we've attempted to reconstruct the data given the
20452082Seschrock * errors we detected, and we've attempted to read all columns. There
20462082Seschrock * must, therefore, be one or more additional problems -- silent errors
20472082Seschrock * resulting in invalid data rather than explicit I/O errors resulting
204810105Sadam.leventhal@sun.com * in absent data. We check if there is enough additional data to
204910105Sadam.leventhal@sun.com * possibly reconstruct the data and then perform combinatorial
205010105Sadam.leventhal@sun.com * reconstruction over all possible combinations. If that fails,
205110105Sadam.leventhal@sun.com * we're cooked.
2052789Sahrens */
205310614SJonathan.Adams@Sun.COM if (total_errors > rm->rm_firstdatacol) {
20547754SJeff.Bonwick@Sun.COM zio->io_error = vdev_raidz_worst_error(rm);
20552082Seschrock
205610614SJonathan.Adams@Sun.COM } else if (total_errors < rm->rm_firstdatacol &&
205710614SJonathan.Adams@Sun.COM (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
20582082Seschrock /*
205910105Sadam.leventhal@sun.com * If we didn't use all the available parity for the
206010105Sadam.leventhal@sun.com * combinatorial reconstruction, verify that the remaining
206110105Sadam.leventhal@sun.com * parity is correct.
20622082Seschrock */
206310105Sadam.leventhal@sun.com if (code != (1 << rm->rm_firstdatacol) - 1)
206410105Sadam.leventhal@sun.com (void) raidz_parity_verify(zio, rm);
206510105Sadam.leventhal@sun.com } else {
206610105Sadam.leventhal@sun.com /*
206710614SJonathan.Adams@Sun.COM * We're here because either:
206810614SJonathan.Adams@Sun.COM *
206910614SJonathan.Adams@Sun.COM * total_errors == rm_first_datacol, or
207010614SJonathan.Adams@Sun.COM * vdev_raidz_combrec() failed
207110614SJonathan.Adams@Sun.COM *
207210614SJonathan.Adams@Sun.COM * In either case, there is enough bad data to prevent
207310614SJonathan.Adams@Sun.COM * reconstruction.
207410614SJonathan.Adams@Sun.COM *
207510614SJonathan.Adams@Sun.COM * Start checksum ereports for all children which haven't
207611670SNeil.Perrin@Sun.COM * failed, and the IO wasn't speculative.
207710105Sadam.leventhal@sun.com */
207810105Sadam.leventhal@sun.com zio->io_error = ECKSUM;
20792082Seschrock
208011670SNeil.Perrin@Sun.COM if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
208111670SNeil.Perrin@Sun.COM for (c = 0; c < rm->rm_cols; c++) {
208211670SNeil.Perrin@Sun.COM rc = &rm->rm_col[c];
208311670SNeil.Perrin@Sun.COM if (rc->rc_error == 0) {
208411670SNeil.Perrin@Sun.COM zio_bad_cksum_t zbc;
208511670SNeil.Perrin@Sun.COM zbc.zbc_has_cksum = 0;
208611670SNeil.Perrin@Sun.COM zbc.zbc_injected =
208711670SNeil.Perrin@Sun.COM rm->rm_ecksuminjected;
208810614SJonathan.Adams@Sun.COM
208911670SNeil.Perrin@Sun.COM zfs_ereport_start_checksum(
209011670SNeil.Perrin@Sun.COM zio->io_spa,
209111670SNeil.Perrin@Sun.COM vd->vdev_child[rc->rc_devidx],
209211670SNeil.Perrin@Sun.COM zio, rc->rc_offset, rc->rc_size,
209311670SNeil.Perrin@Sun.COM (void *)(uintptr_t)c, &zbc);
209411670SNeil.Perrin@Sun.COM }
20952082Seschrock }
20961544Seschrock }
20971544Seschrock }
2098789Sahrens
2099789Sahrens done:
2100789Sahrens zio_checksum_verified(zio);
2101789Sahrens
21028241SJeff.Bonwick@Sun.COM if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2103789Sahrens (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2104789Sahrens /*
2105789Sahrens * Use the good data we have in hand to repair damaged children.
2106789Sahrens */
2107789Sahrens for (c = 0; c < rm->rm_cols; c++) {
2108789Sahrens rc = &rm->rm_col[c];
21092082Seschrock cvd = vd->vdev_child[rc->rc_devidx];
2110789Sahrens
21111732Sbonwick if (rc->rc_error == 0)
21121732Sbonwick continue;
21131732Sbonwick
21147754SJeff.Bonwick@Sun.COM zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
21151732Sbonwick rc->rc_offset, rc->rc_data, rc->rc_size,
21161732Sbonwick ZIO_TYPE_WRITE, zio->io_priority,
21178241SJeff.Bonwick@Sun.COM ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
21188241SJeff.Bonwick@Sun.COM ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
21191732Sbonwick }
2120789Sahrens }
2121789Sahrens }
2122789Sahrens
2123789Sahrens static void
vdev_raidz_state_change(vdev_t * vd,int faulted,int degraded)2124789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2125789Sahrens {
21262082Seschrock if (faulted > vd->vdev_nparity)
21271544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
21281544Seschrock VDEV_AUX_NO_REPLICAS);
2129789Sahrens else if (degraded + faulted != 0)
21301544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2131789Sahrens else
21321544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2133789Sahrens }
2134789Sahrens
2135789Sahrens vdev_ops_t vdev_raidz_ops = {
2136789Sahrens vdev_raidz_open,
2137789Sahrens vdev_raidz_close,
2138789Sahrens vdev_raidz_asize,
2139789Sahrens vdev_raidz_io_start,
2140789Sahrens vdev_raidz_io_done,
2141789Sahrens vdev_raidz_state_change,
214211958SGeorge.Wilson@Sun.COM NULL,
214311958SGeorge.Wilson@Sun.COM NULL,
2144789Sahrens VDEV_TYPE_RAIDZ, /* name of this vdev type */
2145789Sahrens B_FALSE /* not a leaf vdev */
2146789Sahrens };
2147