xref: /onnv-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 10450:c383b4d6980f)
1789Sahrens /*
2789Sahrens  * CDDL HEADER START
3789Sahrens  *
4789Sahrens  * The contents of this file are subject to the terms of the
51544Seschrock  * Common Development and Distribution License (the "License").
61544Seschrock  * You may not use this file except in compliance with the License.
7789Sahrens  *
8789Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9789Sahrens  * or http://www.opensolaris.org/os/licensing.
10789Sahrens  * See the License for the specific language governing permissions
11789Sahrens  * and limitations under the License.
12789Sahrens  *
13789Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14789Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15789Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16789Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17789Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18789Sahrens  *
19789Sahrens  * CDDL HEADER END
20789Sahrens  */
212082Seschrock 
22789Sahrens /*
239434SMark.Musante@Sun.COM  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24789Sahrens  * Use is subject to license terms.
25789Sahrens  */
26789Sahrens 
27789Sahrens #include <sys/zfs_context.h>
28789Sahrens #include <sys/spa.h>
29789Sahrens #include <sys/vdev_impl.h>
30789Sahrens #include <sys/zio.h>
31789Sahrens #include <sys/zio_checksum.h>
32789Sahrens #include <sys/fs/zfs.h>
331544Seschrock #include <sys/fm/fs/zfs.h>
34789Sahrens 
35789Sahrens /*
36789Sahrens  * Virtual device vector for RAID-Z.
372082Seschrock  *
3810105Sadam.leventhal@sun.com  * This vdev supports single, double, and triple parity. For single parity,
3910105Sadam.leventhal@sun.com  * we use a simple XOR of all the data columns. For double or triple parity,
4010105Sadam.leventhal@sun.com  * we use a special case of Reed-Solomon coding. This extends the
4110105Sadam.leventhal@sun.com  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
4210105Sadam.leventhal@sun.com  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
4310105Sadam.leventhal@sun.com  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
4410105Sadam.leventhal@sun.com  * former is also based. The latter is designed to provide higher performance
4510105Sadam.leventhal@sun.com  * for writes.
4610105Sadam.leventhal@sun.com  *
4710105Sadam.leventhal@sun.com  * Note that the Plank paper claimed to support arbitrary N+M, but was then
4810105Sadam.leventhal@sun.com  * amended six years later identifying a critical flaw that invalidates its
4910105Sadam.leventhal@sun.com  * claims. Nevertheless, the technique can be adapted to work for up to
5010105Sadam.leventhal@sun.com  * triple parity. For additional parity, the amendment "Note: Correction to
5110105Sadam.leventhal@sun.com  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
5210105Sadam.leventhal@sun.com  * is viable, but the additional complexity means that write performance will
5310105Sadam.leventhal@sun.com  * suffer.
5410105Sadam.leventhal@sun.com  *
 * All of the methods above operate on a Galois field with 2^N elements. In
 * our case we choose N=8 for GF(2^8) so that all elements can be expressed
 * with a single byte. Briefly, the operations on the field are defined as
 * follows:
592082Seschrock  *
602082Seschrock  *   o addition (+) is represented by a bitwise XOR
612082Seschrock  *   o subtraction (-) is therefore identical to addition: A + B = A - B
622082Seschrock  *   o multiplication of A by 2 is defined by the following bitwise expression:
632082Seschrock  *	(A * 2)_7 = A_6
642082Seschrock  *	(A * 2)_6 = A_5
652082Seschrock  *	(A * 2)_5 = A_4
662082Seschrock  *	(A * 2)_4 = A_3 + A_7
672082Seschrock  *	(A * 2)_3 = A_2 + A_7
682082Seschrock  *	(A * 2)_2 = A_1 + A_7
692082Seschrock  *	(A * 2)_1 = A_0
702082Seschrock  *	(A * 2)_0 = A_7
712082Seschrock  *
722082Seschrock  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
7310105Sadam.leventhal@sun.com  * As an aside, this multiplication is derived from the error correcting
7410105Sadam.leventhal@sun.com  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
752082Seschrock  *
762082Seschrock  * Observe that any number in the field (except for 0) can be expressed as a
772082Seschrock  * power of 2 -- a generator for the field. We store a table of the powers of
782082Seschrock  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
792082Seschrock  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
8010105Sadam.leventhal@sun.com  * than field addition). The inverse of a field element A (A^-1) is therefore
8110105Sadam.leventhal@sun.com  * A ^ (255 - 1) = A^254.
822082Seschrock  *
8310105Sadam.leventhal@sun.com  * The up-to-three parity columns, P, Q, R over several data columns,
8410105Sadam.leventhal@sun.com  * D_0, ... D_n-1, can be expressed by field operations:
852082Seschrock  *
862082Seschrock  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
872082Seschrock  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
882082Seschrock  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
8910105Sadam.leventhal@sun.com  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
9010105Sadam.leventhal@sun.com  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
912082Seschrock  *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
9610105Sadam.leventhal@sun.com  *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
99789Sahrens  */
100789Sahrens 
/*
 * State for one column of a RAID-Z map: the child device and offset the
 * column maps to, the buffer and size of its I/O, and the outcome of that
 * I/O.
 */
typedef struct raidz_col {
	uint64_t	rc_devidx;	/* index of the child device for I/O */
	uint64_t	rc_offset;	/* offset on that device */
	uint64_t	rc_size;	/* size of the I/O */
	void		*rc_data;	/* buffer for the I/O data */
	int		rc_error;	/* error from this device's I/O */
	uint8_t		rc_tried;	/* was this column's I/O attempted? */
	uint8_t		rc_skipped;	/* was this column's I/O skipped? */
} raidz_col_t;
110789Sahrens 
111789Sahrens typedef struct raidz_map {
11210105Sadam.leventhal@sun.com 	uint64_t rm_cols;		/* Regular column count */
11310105Sadam.leventhal@sun.com 	uint64_t rm_scols;		/* Count including skipped columns */
1142082Seschrock 	uint64_t rm_bigcols;		/* Number of oversized columns */
1152082Seschrock 	uint64_t rm_asize;		/* Actual total I/O size */
1162082Seschrock 	uint64_t rm_missingdata;	/* Count of missing data devices */
1172082Seschrock 	uint64_t rm_missingparity;	/* Count of missing parity devices */
1182082Seschrock 	uint64_t rm_firstdatacol;	/* First data column/parity count */
119*10450Sadam.leventhal@sun.com 	uint64_t rm_nskip;		/* Skipped sectors for padding */
120*10450Sadam.leventhal@sun.com 	uint64_t rm_skipstart;	/* Column index of padding start */
1212082Seschrock 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
122789Sahrens } raidz_map_t;
123789Sahrens 
/*
 * Positions of the P, Q and R parity columns within a map, and the largest
 * number of parity columns supported.
 */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2
#define	VDEV_RAIDZ_MAXPARITY	3

/*
 * Field multiplication of a single byte-sized element by 2 and by 4: shift
 * left and, when the top bit was set, fold it back in with 0x1d. The
 * expansion evaluates (x) more than once and does not mask the result to
 * eight bits.
 */
#define	VDEV_RAIDZ_MUL_2(x)	((((x) & 0x80) ? 0x1d : 0) ^ ((x) << 1))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
1312082Seschrock 
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * Both macros update (x) in place and use (mask) as scratch space; each
 * argument must be a modifiable 64-bit lvalue and is evaluated more than
 * once, so it must not have side effects. The bodies are wrapped in
 * do { } while (0) so that an invocation followed by a semicolon is exactly
 * one statement and can be used safely in an unbraced if/else.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	/* isolate the top bit of every byte ... */ \
	(mask) = (x) & 0x8080808080808080ULL; \
	/* ... and smear each set top bit into a full 0xff byte */ \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	/* shift each byte left by one, then fold in 0x1d where needed */ \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)
15110105Sadam.leventhal@sun.com 
/*
 * Flag that forces reconstruction to go through the general purpose method.
 */
int vdev_raidz_default_to_general;
1562082Seschrock 
/*
 * Powers of 2 in the Galois field defined above: vdev_raidz_pow2[i] is 2^i.
 * Each entry is the previous one multiplied by 2 in the field. The trailing
 * comment on each row gives the index of its first entry.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,	/* 0x00 */
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,	/* 0x08 */
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,	/* 0x10 */
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,	/* 0x18 */
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,	/* 0x20 */
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,	/* 0x28 */
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,	/* 0x30 */
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,	/* 0x38 */
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,	/* 0x40 */
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,	/* 0x48 */
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,	/* 0x50 */
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,	/* 0x58 */
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,	/* 0x60 */
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,	/* 0x68 */
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,	/* 0x70 */
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,	/* 0x78 */
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,	/* 0x80 */
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,	/* 0x88 */
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,	/* 0x90 */
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,	/* 0x98 */
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,	/* 0xa0 */
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,	/* 0xa8 */
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,	/* 0xb0 */
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,	/* 0xb8 */
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,	/* 0xc0 */
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,	/* 0xc8 */
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,	/* 0xd0 */
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,	/* 0xd8 */
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,	/* 0xe0 */
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,	/* 0xe8 */
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,	/* 0xf0 */
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01	/* 0xf8 */
};
/*
 * Logs base 2 in the same Galois field: for a non-zero element a,
 * vdev_raidz_log2[a] is the exponent e in [0, 254] with 2^e == a. Zero has
 * no logarithm; its slot holds 0. The trailing comment on each row gives
 * the index of its first entry.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,	/* 0x00 */
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,	/* 0x08 */
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,	/* 0x10 */
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,	/* 0x18 */
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,	/* 0x20 */
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,	/* 0x28 */
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,	/* 0x30 */
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,	/* 0x38 */
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,	/* 0x40 */
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,	/* 0x48 */
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,	/* 0x50 */
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,	/* 0x58 */
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,	/* 0x60 */
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,	/* 0x68 */
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,	/* 0x70 */
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,	/* 0x78 */
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,	/* 0x80 */
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,	/* 0x88 */
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,	/* 0x90 */
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,	/* 0x98 */
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,	/* 0xa0 */
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,	/* 0xa8 */
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,	/* 0xb0 */
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,	/* 0xb8 */
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,	/* 0xc0 */
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,	/* 0xc8 */
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,	/* 0xd0 */
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,	/* 0xd8 */
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,	/* 0xe0 */
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,	/* 0xe8 */
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,	/* 0xf0 */
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,	/* 0xf8 */
};
2292082Seschrock 
2302082Seschrock /*
2312082Seschrock  * Multiply a given number by 2 raised to the given power.
2322082Seschrock  */
2332082Seschrock static uint8_t
2342082Seschrock vdev_raidz_exp2(uint_t a, int exp)
2352082Seschrock {
2362082Seschrock 	if (a == 0)
2372082Seschrock 		return (0);
2382082Seschrock 
2392082Seschrock 	ASSERT(exp >= 0);
2402082Seschrock 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
2412082Seschrock 
2422082Seschrock 	exp += vdev_raidz_log2[a];
2432082Seschrock 	if (exp > 255)
2442082Seschrock 		exp -= 255;
2452082Seschrock 
2462082Seschrock 	return (vdev_raidz_pow2[exp]);
2472082Seschrock }
2482082Seschrock 
2497754SJeff.Bonwick@Sun.COM static void
2507754SJeff.Bonwick@Sun.COM vdev_raidz_map_free(zio_t *zio)
2517754SJeff.Bonwick@Sun.COM {
2527754SJeff.Bonwick@Sun.COM 	raidz_map_t *rm = zio->io_vsd;
2537754SJeff.Bonwick@Sun.COM 	int c;
2547754SJeff.Bonwick@Sun.COM 
2557754SJeff.Bonwick@Sun.COM 	for (c = 0; c < rm->rm_firstdatacol; c++)
2567754SJeff.Bonwick@Sun.COM 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
2577754SJeff.Bonwick@Sun.COM 
25810105Sadam.leventhal@sun.com 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
2597754SJeff.Bonwick@Sun.COM }
2607754SJeff.Bonwick@Sun.COM 
261789Sahrens static raidz_map_t *
2622082Seschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
2632082Seschrock     uint64_t nparity)
264789Sahrens {
265789Sahrens 	raidz_map_t *rm;
266789Sahrens 	uint64_t b = zio->io_offset >> unit_shift;
267789Sahrens 	uint64_t s = zio->io_size >> unit_shift;
268789Sahrens 	uint64_t f = b % dcols;
269789Sahrens 	uint64_t o = (b / dcols) << unit_shift;
27010105Sadam.leventhal@sun.com 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
271789Sahrens 
2722082Seschrock 	q = s / (dcols - nparity);
2732082Seschrock 	r = s - q * (dcols - nparity);
2742082Seschrock 	bc = (r == 0 ? 0 : r + nparity);
27510105Sadam.leventhal@sun.com 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
276789Sahrens 
27710105Sadam.leventhal@sun.com 	if (q == 0) {
27810105Sadam.leventhal@sun.com 		acols = bc;
27910105Sadam.leventhal@sun.com 		scols = MIN(dcols, roundup(bc, nparity + 1));
28010105Sadam.leventhal@sun.com 	} else {
28110105Sadam.leventhal@sun.com 		acols = dcols;
28210105Sadam.leventhal@sun.com 		scols = dcols;
28310105Sadam.leventhal@sun.com 	}
284789Sahrens 
28510105Sadam.leventhal@sun.com 	ASSERT3U(acols, <=, scols);
28610105Sadam.leventhal@sun.com 
28710105Sadam.leventhal@sun.com 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
288789Sahrens 
289789Sahrens 	rm->rm_cols = acols;
29010105Sadam.leventhal@sun.com 	rm->rm_scols = scols;
291789Sahrens 	rm->rm_bigcols = bc;
292*10450Sadam.leventhal@sun.com 	rm->rm_skipstart = bc;
2932082Seschrock 	rm->rm_missingdata = 0;
2942082Seschrock 	rm->rm_missingparity = 0;
2952082Seschrock 	rm->rm_firstdatacol = nparity;
296789Sahrens 
29710105Sadam.leventhal@sun.com 	asize = 0;
29810105Sadam.leventhal@sun.com 
29910105Sadam.leventhal@sun.com 	for (c = 0; c < scols; c++) {
300789Sahrens 		col = f + c;
301789Sahrens 		coff = o;
302789Sahrens 		if (col >= dcols) {
303789Sahrens 			col -= dcols;
304789Sahrens 			coff += 1ULL << unit_shift;
305789Sahrens 		}
3062082Seschrock 		rm->rm_col[c].rc_devidx = col;
307789Sahrens 		rm->rm_col[c].rc_offset = coff;
308789Sahrens 		rm->rm_col[c].rc_data = NULL;
309789Sahrens 		rm->rm_col[c].rc_error = 0;
310789Sahrens 		rm->rm_col[c].rc_tried = 0;
311789Sahrens 		rm->rm_col[c].rc_skipped = 0;
31210105Sadam.leventhal@sun.com 
31310105Sadam.leventhal@sun.com 		if (c >= acols)
31410105Sadam.leventhal@sun.com 			rm->rm_col[c].rc_size = 0;
31510105Sadam.leventhal@sun.com 		else if (c < bc)
31610105Sadam.leventhal@sun.com 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
31710105Sadam.leventhal@sun.com 		else
31810105Sadam.leventhal@sun.com 			rm->rm_col[c].rc_size = q << unit_shift;
31910105Sadam.leventhal@sun.com 
32010105Sadam.leventhal@sun.com 		asize += rm->rm_col[c].rc_size;
321789Sahrens 	}
322789Sahrens 
32310105Sadam.leventhal@sun.com 	ASSERT3U(asize, ==, tot << unit_shift);
32410105Sadam.leventhal@sun.com 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
325*10450Sadam.leventhal@sun.com 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
326*10450Sadam.leventhal@sun.com 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
327*10450Sadam.leventhal@sun.com 	ASSERT3U(rm->rm_nskip, <=, nparity);
328789Sahrens 
329789Sahrens 	for (c = 0; c < rm->rm_firstdatacol; c++)
330789Sahrens 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
331789Sahrens 
332789Sahrens 	rm->rm_col[c].rc_data = zio->io_data;
333789Sahrens 
334789Sahrens 	for (c = c + 1; c < acols; c++)
335789Sahrens 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
336789Sahrens 		    rm->rm_col[c - 1].rc_size;
337789Sahrens 
3381133Seschrock 	/*
3392082Seschrock 	 * If all data stored spans all columns, there's a danger that parity
3402082Seschrock 	 * will always be on the same device and, since parity isn't read
3412082Seschrock 	 * during normal operation, that that device's I/O bandwidth won't be
3422082Seschrock 	 * used effectively. We therefore switch the parity every 1MB.
3432082Seschrock 	 *
3442082Seschrock 	 * ... at least that was, ostensibly, the theory. As a practical
3452082Seschrock 	 * matter unless we juggle the parity between all devices evenly, we
3462082Seschrock 	 * won't see any benefit. Further, occasional writes that aren't a
3472082Seschrock 	 * multiple of the LCM of the number of children and the minimum
3482082Seschrock 	 * stripe width are sufficient to avoid pessimal behavior.
3492082Seschrock 	 * Unfortunately, this decision created an implicit on-disk format
3503456Sahl 	 * requirement that we need to support for all eternity, but only
3513456Sahl 	 * for single-parity RAID-Z.
352*10450Sadam.leventhal@sun.com 	 *
353*10450Sadam.leventhal@sun.com 	 * If we intend to skip a sector in the zeroth column for padding
354*10450Sadam.leventhal@sun.com 	 * we must make sure to note this swap. We will never intend to
355*10450Sadam.leventhal@sun.com 	 * skip the first column since at least one data and one parity
356*10450Sadam.leventhal@sun.com 	 * column must appear in each row.
3571133Seschrock 	 */
3581133Seschrock 	ASSERT(rm->rm_cols >= 2);
3591133Seschrock 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
360789Sahrens 
3612082Seschrock 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
3622082Seschrock 		devidx = rm->rm_col[0].rc_devidx;
3631133Seschrock 		o = rm->rm_col[0].rc_offset;
3642082Seschrock 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
3651133Seschrock 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
3662082Seschrock 		rm->rm_col[1].rc_devidx = devidx;
3671133Seschrock 		rm->rm_col[1].rc_offset = o;
368*10450Sadam.leventhal@sun.com 
369*10450Sadam.leventhal@sun.com 		if (rm->rm_skipstart == 0)
370*10450Sadam.leventhal@sun.com 			rm->rm_skipstart = 1;
371789Sahrens 	}
372789Sahrens 
373789Sahrens 	zio->io_vsd = rm;
3747754SJeff.Bonwick@Sun.COM 	zio->io_vsd_free = vdev_raidz_map_free;
375789Sahrens 	return (rm);
376789Sahrens }
377789Sahrens 
378789Sahrens static void
3792082Seschrock vdev_raidz_generate_parity_p(raidz_map_t *rm)
3802082Seschrock {
3812082Seschrock 	uint64_t *p, *src, pcount, ccount, i;
3822082Seschrock 	int c;
3832082Seschrock 
3842082Seschrock 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
3852082Seschrock 
3862082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
3872082Seschrock 		src = rm->rm_col[c].rc_data;
3882082Seschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
3892082Seschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
3902082Seschrock 
3912082Seschrock 		if (c == rm->rm_firstdatacol) {
3922082Seschrock 			ASSERT(ccount == pcount);
39310105Sadam.leventhal@sun.com 			for (i = 0; i < ccount; i++, src++, p++) {
3942082Seschrock 				*p = *src;
3952082Seschrock 			}
3962082Seschrock 		} else {
3972082Seschrock 			ASSERT(ccount <= pcount);
39810105Sadam.leventhal@sun.com 			for (i = 0; i < ccount; i++, src++, p++) {
3992082Seschrock 				*p ^= *src;
4002082Seschrock 			}
4012082Seschrock 		}
4022082Seschrock 	}
4032082Seschrock }
4042082Seschrock 
4052082Seschrock static void
4062082Seschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm)
407789Sahrens {
40810105Sadam.leventhal@sun.com 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
4092082Seschrock 	int c;
4102082Seschrock 
41110105Sadam.leventhal@sun.com 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
4122082Seschrock 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
4132082Seschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
4142082Seschrock 
4152082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
4162082Seschrock 		src = rm->rm_col[c].rc_data;
4172082Seschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
4182082Seschrock 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
41910105Sadam.leventhal@sun.com 
42010105Sadam.leventhal@sun.com 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
4212082Seschrock 
4222082Seschrock 		if (c == rm->rm_firstdatacol) {
42310105Sadam.leventhal@sun.com 			ASSERT(ccnt == pcnt || ccnt == 0);
42410105Sadam.leventhal@sun.com 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
4252082Seschrock 				*p = *src;
42610105Sadam.leventhal@sun.com 				*q = *src;
4272082Seschrock 			}
42810105Sadam.leventhal@sun.com 			for (; i < pcnt; i++, src++, p++, q++) {
42910105Sadam.leventhal@sun.com 				*p = 0;
4302082Seschrock 				*q = 0;
4312082Seschrock 			}
4322082Seschrock 		} else {
43310105Sadam.leventhal@sun.com 			ASSERT(ccnt <= pcnt);
434789Sahrens 
4352082Seschrock 			/*
43610105Sadam.leventhal@sun.com 			 * Apply the algorithm described above by multiplying
43710105Sadam.leventhal@sun.com 			 * the previous result and adding in the new value.
4382082Seschrock 			 */
43910105Sadam.leventhal@sun.com 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
44010105Sadam.leventhal@sun.com 				*p ^= *src;
44110105Sadam.leventhal@sun.com 
44210105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*q, mask);
4432082Seschrock 				*q ^= *src;
4442082Seschrock 			}
4452082Seschrock 
4462082Seschrock 			/*
4472082Seschrock 			 * Treat short columns as though they are full of 0s.
44810105Sadam.leventhal@sun.com 			 * Note that there's therefore nothing needed for P.
4492082Seschrock 			 */
45010105Sadam.leventhal@sun.com 			for (; i < pcnt; i++, q++) {
45110105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*q, mask);
4522082Seschrock 			}
4532082Seschrock 		}
4542082Seschrock 	}
4552082Seschrock }
4562082Seschrock 
4572082Seschrock static void
45810105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
45910105Sadam.leventhal@sun.com {
46010105Sadam.leventhal@sun.com 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
46110105Sadam.leventhal@sun.com 	int c;
46210105Sadam.leventhal@sun.com 
46310105Sadam.leventhal@sun.com 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
46410105Sadam.leventhal@sun.com 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
46510105Sadam.leventhal@sun.com 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
46610105Sadam.leventhal@sun.com 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
46710105Sadam.leventhal@sun.com 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
46810105Sadam.leventhal@sun.com 
46910105Sadam.leventhal@sun.com 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
47010105Sadam.leventhal@sun.com 		src = rm->rm_col[c].rc_data;
47110105Sadam.leventhal@sun.com 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
47210105Sadam.leventhal@sun.com 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
47310105Sadam.leventhal@sun.com 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
47410105Sadam.leventhal@sun.com 
47510105Sadam.leventhal@sun.com 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
47610105Sadam.leventhal@sun.com 
47710105Sadam.leventhal@sun.com 		if (c == rm->rm_firstdatacol) {
47810105Sadam.leventhal@sun.com 			ASSERT(ccnt == pcnt || ccnt == 0);
47910105Sadam.leventhal@sun.com 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
48010105Sadam.leventhal@sun.com 				*p = *src;
48110105Sadam.leventhal@sun.com 				*q = *src;
48210105Sadam.leventhal@sun.com 				*r = *src;
48310105Sadam.leventhal@sun.com 			}
48410105Sadam.leventhal@sun.com 			for (; i < pcnt; i++, src++, p++, q++, r++) {
48510105Sadam.leventhal@sun.com 				*p = 0;
48610105Sadam.leventhal@sun.com 				*q = 0;
48710105Sadam.leventhal@sun.com 				*r = 0;
48810105Sadam.leventhal@sun.com 			}
48910105Sadam.leventhal@sun.com 		} else {
49010105Sadam.leventhal@sun.com 			ASSERT(ccnt <= pcnt);
49110105Sadam.leventhal@sun.com 
49210105Sadam.leventhal@sun.com 			/*
49310105Sadam.leventhal@sun.com 			 * Apply the algorithm described above by multiplying
49410105Sadam.leventhal@sun.com 			 * the previous result and adding in the new value.
49510105Sadam.leventhal@sun.com 			 */
49610105Sadam.leventhal@sun.com 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
49710105Sadam.leventhal@sun.com 				*p ^= *src;
49810105Sadam.leventhal@sun.com 
49910105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*q, mask);
50010105Sadam.leventhal@sun.com 				*q ^= *src;
50110105Sadam.leventhal@sun.com 
50210105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_4(*r, mask);
50310105Sadam.leventhal@sun.com 				*r ^= *src;
50410105Sadam.leventhal@sun.com 			}
50510105Sadam.leventhal@sun.com 
50610105Sadam.leventhal@sun.com 			/*
50710105Sadam.leventhal@sun.com 			 * Treat short columns as though they are full of 0s.
50810105Sadam.leventhal@sun.com 			 * Note that there's therefore nothing needed for P.
50910105Sadam.leventhal@sun.com 			 */
51010105Sadam.leventhal@sun.com 			for (; i < pcnt; i++, q++, r++) {
51110105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*q, mask);
51210105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_4(*r, mask);
51310105Sadam.leventhal@sun.com 			}
51410105Sadam.leventhal@sun.com 		}
51510105Sadam.leventhal@sun.com 	}
51610105Sadam.leventhal@sun.com }
51710105Sadam.leventhal@sun.com 
51810105Sadam.leventhal@sun.com /*
51910105Sadam.leventhal@sun.com  * Generate RAID parity in the first virtual columns according to the number of
52010105Sadam.leventhal@sun.com  * parity columns available.
52110105Sadam.leventhal@sun.com  */
52210105Sadam.leventhal@sun.com static void
52310105Sadam.leventhal@sun.com vdev_raidz_generate_parity(raidz_map_t *rm)
52410105Sadam.leventhal@sun.com {
52510105Sadam.leventhal@sun.com 	switch (rm->rm_firstdatacol) {
52610105Sadam.leventhal@sun.com 	case 1:
52710105Sadam.leventhal@sun.com 		vdev_raidz_generate_parity_p(rm);
52810105Sadam.leventhal@sun.com 		break;
52910105Sadam.leventhal@sun.com 	case 2:
53010105Sadam.leventhal@sun.com 		vdev_raidz_generate_parity_pq(rm);
53110105Sadam.leventhal@sun.com 		break;
53210105Sadam.leventhal@sun.com 	case 3:
53310105Sadam.leventhal@sun.com 		vdev_raidz_generate_parity_pqr(rm);
53410105Sadam.leventhal@sun.com 		break;
53510105Sadam.leventhal@sun.com 	default:
53610105Sadam.leventhal@sun.com 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
53710105Sadam.leventhal@sun.com 	}
53810105Sadam.leventhal@sun.com }
53910105Sadam.leventhal@sun.com 
54010105Sadam.leventhal@sun.com static int
54110105Sadam.leventhal@sun.com vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
5422082Seschrock {
5432082Seschrock 	uint64_t *dst, *src, xcount, ccount, count, i;
54410105Sadam.leventhal@sun.com 	int x = tgts[0];
5452082Seschrock 	int c;
5462082Seschrock 
54710105Sadam.leventhal@sun.com 	ASSERT(ntgts == 1);
54810105Sadam.leventhal@sun.com 	ASSERT(x >= rm->rm_firstdatacol);
54910105Sadam.leventhal@sun.com 	ASSERT(x < rm->rm_cols);
55010105Sadam.leventhal@sun.com 
5512082Seschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
5522082Seschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
5532082Seschrock 	ASSERT(xcount > 0);
5542082Seschrock 
5552082Seschrock 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
5562082Seschrock 	dst = rm->rm_col[x].rc_data;
5572082Seschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
5582082Seschrock 		*dst = *src;
5592082Seschrock 	}
5602082Seschrock 
5612082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
562789Sahrens 		src = rm->rm_col[c].rc_data;
563789Sahrens 		dst = rm->rm_col[x].rc_data;
5642082Seschrock 
5652082Seschrock 		if (c == x)
5662082Seschrock 			continue;
5672082Seschrock 
5682082Seschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
5692082Seschrock 		count = MIN(ccount, xcount);
5702082Seschrock 
5712082Seschrock 		for (i = 0; i < count; i++, dst++, src++) {
5722082Seschrock 			*dst ^= *src;
573789Sahrens 		}
574789Sahrens 	}
57510105Sadam.leventhal@sun.com 
57610105Sadam.leventhal@sun.com 	return (1 << VDEV_RAIDZ_P);
577789Sahrens }
578789Sahrens 
57910105Sadam.leventhal@sun.com static int
58010105Sadam.leventhal@sun.com vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
5812082Seschrock {
5822082Seschrock 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
5832082Seschrock 	uint8_t *b;
58410105Sadam.leventhal@sun.com 	int x = tgts[0];
5852082Seschrock 	int c, j, exp;
5862082Seschrock 
58710105Sadam.leventhal@sun.com 	ASSERT(ntgts == 1);
58810105Sadam.leventhal@sun.com 
5892082Seschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
5902082Seschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
5912082Seschrock 
5922082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
5932082Seschrock 		src = rm->rm_col[c].rc_data;
5942082Seschrock 		dst = rm->rm_col[x].rc_data;
5952082Seschrock 
5962082Seschrock 		if (c == x)
5972082Seschrock 			ccount = 0;
5982082Seschrock 		else
5992082Seschrock 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
6002082Seschrock 
6012082Seschrock 		count = MIN(ccount, xcount);
6022082Seschrock 
6032082Seschrock 		if (c == rm->rm_firstdatacol) {
6042082Seschrock 			for (i = 0; i < count; i++, dst++, src++) {
6052082Seschrock 				*dst = *src;
6062082Seschrock 			}
6072082Seschrock 			for (; i < xcount; i++, dst++) {
6082082Seschrock 				*dst = 0;
6092082Seschrock 			}
6102082Seschrock 
6112082Seschrock 		} else {
6122082Seschrock 			for (i = 0; i < count; i++, dst++, src++) {
61310105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*dst, mask);
6142082Seschrock 				*dst ^= *src;
6152082Seschrock 			}
6162082Seschrock 
6172082Seschrock 			for (; i < xcount; i++, dst++) {
61810105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*dst, mask);
6192082Seschrock 			}
6202082Seschrock 		}
6212082Seschrock 	}
6222082Seschrock 
6232082Seschrock 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
6242082Seschrock 	dst = rm->rm_col[x].rc_data;
6252082Seschrock 	exp = 255 - (rm->rm_cols - 1 - x);
6262082Seschrock 
6272082Seschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
6282082Seschrock 		*dst ^= *src;
6292082Seschrock 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
6302082Seschrock 			*b = vdev_raidz_exp2(*b, exp);
6312082Seschrock 		}
6322082Seschrock 	}
63310105Sadam.leventhal@sun.com 
63410105Sadam.leventhal@sun.com 	return (1 << VDEV_RAIDZ_Q);
6352082Seschrock }
6362082Seschrock 
63710105Sadam.leventhal@sun.com static int
63810105Sadam.leventhal@sun.com vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
6392082Seschrock {
6402082Seschrock 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
6412082Seschrock 	void *pdata, *qdata;
6422082Seschrock 	uint64_t xsize, ysize, i;
64310105Sadam.leventhal@sun.com 	int x = tgts[0];
64410105Sadam.leventhal@sun.com 	int y = tgts[1];
6452082Seschrock 
64610105Sadam.leventhal@sun.com 	ASSERT(ntgts == 2);
6472082Seschrock 	ASSERT(x < y);
6482082Seschrock 	ASSERT(x >= rm->rm_firstdatacol);
6492082Seschrock 	ASSERT(y < rm->rm_cols);
6502082Seschrock 
6512082Seschrock 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
6522082Seschrock 
6532082Seschrock 	/*
6542082Seschrock 	 * Move the parity data aside -- we're going to compute parity as
6552082Seschrock 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
6562082Seschrock 	 * reuse the parity generation mechanism without trashing the actual
6572082Seschrock 	 * parity so we make those columns appear to be full of zeros by
6582082Seschrock 	 * setting their lengths to zero.
6592082Seschrock 	 */
6602082Seschrock 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
6612082Seschrock 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
6622082Seschrock 	xsize = rm->rm_col[x].rc_size;
6632082Seschrock 	ysize = rm->rm_col[y].rc_size;
6642082Seschrock 
6652082Seschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
6662082Seschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
6672082Seschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
6682082Seschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
6692082Seschrock 	rm->rm_col[x].rc_size = 0;
6702082Seschrock 	rm->rm_col[y].rc_size = 0;
6712082Seschrock 
6722082Seschrock 	vdev_raidz_generate_parity_pq(rm);
6732082Seschrock 
6742082Seschrock 	rm->rm_col[x].rc_size = xsize;
6752082Seschrock 	rm->rm_col[y].rc_size = ysize;
6762082Seschrock 
6772082Seschrock 	p = pdata;
6782082Seschrock 	q = qdata;
6792082Seschrock 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
6802082Seschrock 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
6812082Seschrock 	xd = rm->rm_col[x].rc_data;
6822082Seschrock 	yd = rm->rm_col[y].rc_data;
6832082Seschrock 
6842082Seschrock 	/*
6852082Seschrock 	 * We now have:
6862082Seschrock 	 *	Pxy = P + D_x + D_y
6872082Seschrock 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
6882082Seschrock 	 *
6892082Seschrock 	 * We can then solve for D_x:
6902082Seschrock 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
6912082Seschrock 	 * where
6922082Seschrock 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
6932082Seschrock 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
6942082Seschrock 	 *
6952082Seschrock 	 * With D_x in hand, we can easily solve for D_y:
6962082Seschrock 	 *	D_y = P + Pxy + D_x
6972082Seschrock 	 */
6982082Seschrock 
6992082Seschrock 	a = vdev_raidz_pow2[255 + x - y];
7002082Seschrock 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
7012082Seschrock 	tmp = 255 - vdev_raidz_log2[a ^ 1];
7022082Seschrock 
7032082Seschrock 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
7042082Seschrock 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
7052082Seschrock 
7062082Seschrock 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
7072082Seschrock 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
7082082Seschrock 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
7092082Seschrock 
7102082Seschrock 		if (i < ysize)
7112082Seschrock 			*yd = *p ^ *pxy ^ *xd;
7122082Seschrock 	}
7132082Seschrock 
7142082Seschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
7152082Seschrock 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
7162082Seschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
7172082Seschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
7182082Seschrock 
7192082Seschrock 	/*
7202082Seschrock 	 * Restore the saved parity data.
7212082Seschrock 	 */
7222082Seschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
7232082Seschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
72410105Sadam.leventhal@sun.com 
72510105Sadam.leventhal@sun.com 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
72610105Sadam.leventhal@sun.com }
72710105Sadam.leventhal@sun.com 
72810105Sadam.leventhal@sun.com /* BEGIN CSTYLED */
72910105Sadam.leventhal@sun.com /*
73010105Sadam.leventhal@sun.com  * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
73210105Sadam.leventhal@sun.com  * the contents of the data and parity disks. This can be expressed with
73310105Sadam.leventhal@sun.com  * vectors for the original data (D) and the actual data (d) and parity (p)
73410105Sadam.leventhal@sun.com  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
73510105Sadam.leventhal@sun.com  *
73610105Sadam.leventhal@sun.com  *            __   __                     __     __
73710105Sadam.leventhal@sun.com  *            |     |         __     __   |  p_0  |
73810105Sadam.leventhal@sun.com  *            |  V  |         |  D_0  |   | p_m-1 |
73910105Sadam.leventhal@sun.com  *            |     |    x    |   :   | = |  d_0  |
74010105Sadam.leventhal@sun.com  *            |  I  |         | D_n-1 |   |   :   |
74110105Sadam.leventhal@sun.com  *            |     |         ~~     ~~   | d_n-1 |
74210105Sadam.leventhal@sun.com  *            ~~   ~~                     ~~     ~~
74310105Sadam.leventhal@sun.com  *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity and speedy
 * computation as well as linear separability.
74810105Sadam.leventhal@sun.com  *
74910105Sadam.leventhal@sun.com  *      __               __               __     __
75010105Sadam.leventhal@sun.com  *      |   1   ..  1 1 1 |               |  p_0  |
75110105Sadam.leventhal@sun.com  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
75210105Sadam.leventhal@sun.com  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
75310105Sadam.leventhal@sun.com  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
75410105Sadam.leventhal@sun.com  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
75510105Sadam.leventhal@sun.com  *      |   :       : : : |   |   :   |   |  d_2  |
75610105Sadam.leventhal@sun.com  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
75710105Sadam.leventhal@sun.com  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
75810105Sadam.leventhal@sun.com  *      |   0   ..  0 0 1 |               | d_n-1 |
75910105Sadam.leventhal@sun.com  *      ~~               ~~               ~~     ~~
76010105Sadam.leventhal@sun.com  *
76110105Sadam.leventhal@sun.com  * Note that I, V, d, and p are known. To compute D, we must invert the
76210105Sadam.leventhal@sun.com  * matrix and use the known data and parity values to reconstruct the unknown
76310105Sadam.leventhal@sun.com  * data values. We begin by removing the rows in V|I and d|p that correspond
76410105Sadam.leventhal@sun.com  * to failed or missing columns; we then make V|I square (n x n) and d|p
76510105Sadam.leventhal@sun.com  * sized n by removing rows corresponding to unused parity from the bottom up
76610105Sadam.leventhal@sun.com  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
76710105Sadam.leventhal@sun.com  * using Gauss-Jordan elimination. In the example below we use m=3 parity
76810105Sadam.leventhal@sun.com  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
76910105Sadam.leventhal@sun.com  *           __                               __
77010105Sadam.leventhal@sun.com  *           |  1   1   1   1   1   1   1   1  |
77110105Sadam.leventhal@sun.com  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
77210105Sadam.leventhal@sun.com  *           |  19 205 116  29  64  16  4   1  |      / /
77310105Sadam.leventhal@sun.com  *           |  1   0   0   0   0   0   0   0  |     / /
77410105Sadam.leventhal@sun.com  *           |  0   1   0   0   0   0   0   0  | <--' /
77510105Sadam.leventhal@sun.com  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
77610105Sadam.leventhal@sun.com  *           |  0   0   0   1   0   0   0   0  |
77710105Sadam.leventhal@sun.com  *           |  0   0   0   0   1   0   0   0  |
77810105Sadam.leventhal@sun.com  *           |  0   0   0   0   0   1   0   0  |
77910105Sadam.leventhal@sun.com  *           |  0   0   0   0   0   0   1   0  |
78010105Sadam.leventhal@sun.com  *           |  0   0   0   0   0   0   0   1  |
78110105Sadam.leventhal@sun.com  *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
79510105Sadam.leventhal@sun.com  *
79610105Sadam.leventhal@sun.com  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
79710105Sadam.leventhal@sun.com  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
79810105Sadam.leventhal@sun.com  * matrix is not singular.
79910105Sadam.leventhal@sun.com  * __                                                                 __
80010105Sadam.leventhal@sun.com  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
80110105Sadam.leventhal@sun.com  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
80210105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
80310105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
80410105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
80510105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
80610105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
80710105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
80810105Sadam.leventhal@sun.com  * ~~                                                                 ~~
80910105Sadam.leventhal@sun.com  * __                                                                 __
81010105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
81110105Sadam.leventhal@sun.com  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
81210105Sadam.leventhal@sun.com  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
81310105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
81410105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
81510105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
81610105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
81710105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
81810105Sadam.leventhal@sun.com  * ~~                                                                 ~~
81910105Sadam.leventhal@sun.com  * __                                                                 __
82010105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
82110105Sadam.leventhal@sun.com  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
82210105Sadam.leventhal@sun.com  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
82310105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
82410105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
82510105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
82610105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
82710105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
82810105Sadam.leventhal@sun.com  * ~~                                                                 ~~
82910105Sadam.leventhal@sun.com  * __                                                                 __
83010105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
83110105Sadam.leventhal@sun.com  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
83210105Sadam.leventhal@sun.com  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
83310105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
83410105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
83510105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
83610105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
83710105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
83810105Sadam.leventhal@sun.com  * ~~                                                                 ~~
83910105Sadam.leventhal@sun.com  * __                                                                 __
84010105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
84110105Sadam.leventhal@sun.com  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
84210105Sadam.leventhal@sun.com  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
84310105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
84410105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
84510105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
84610105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
84710105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
84810105Sadam.leventhal@sun.com  * ~~                                                                 ~~
84910105Sadam.leventhal@sun.com  * __                                                                 __
85010105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
85110105Sadam.leventhal@sun.com  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
85210105Sadam.leventhal@sun.com  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
85310105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
85410105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
85510105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
85610105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
85710105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
85810105Sadam.leventhal@sun.com  * ~~                                                                 ~~
85910105Sadam.leventhal@sun.com  *                   __                               __
86010105Sadam.leventhal@sun.com  *                   |  0   0   1   0   0   0   0   0  |
86110105Sadam.leventhal@sun.com  *                   | 167 100  5   41 159 169 217 208 |
86210105Sadam.leventhal@sun.com  *                   | 166 100  4   40 158 168 216 209 |
86310105Sadam.leventhal@sun.com  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
86410105Sadam.leventhal@sun.com  *                   |  0   0   0   0   1   0   0   0  |
86510105Sadam.leventhal@sun.com  *                   |  0   0   0   0   0   1   0   0  |
86610105Sadam.leventhal@sun.com  *                   |  0   0   0   0   0   0   1   0  |
86710105Sadam.leventhal@sun.com  *                   |  0   0   0   0   0   0   0   1  |
86810105Sadam.leventhal@sun.com  *                   ~~                               ~~
86910105Sadam.leventhal@sun.com  *
87010105Sadam.leventhal@sun.com  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
87110105Sadam.leventhal@sun.com  * of the missing data.
87210105Sadam.leventhal@sun.com  *
87310105Sadam.leventhal@sun.com  * As is apparent from the example above, the only non-trivial rows in the
87410105Sadam.leventhal@sun.com  * inverse matrix correspond to the data disks that we're trying to
87510105Sadam.leventhal@sun.com  * reconstruct. Indeed, those are the only rows we need as the others would
87610105Sadam.leventhal@sun.com  * only be useful for reconstructing data known or assumed to be valid. For
87710105Sadam.leventhal@sun.com  * that reason, we only build the coefficients in the rows that correspond to
87810105Sadam.leventhal@sun.com  * targeted columns.
87910105Sadam.leventhal@sun.com  */
88010105Sadam.leventhal@sun.com /* END CSTYLED */
88110105Sadam.leventhal@sun.com 
88210105Sadam.leventhal@sun.com static void
88310105Sadam.leventhal@sun.com vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
88410105Sadam.leventhal@sun.com     uint8_t **rows)
88510105Sadam.leventhal@sun.com {
88610105Sadam.leventhal@sun.com 	int i, j;
88710105Sadam.leventhal@sun.com 	int pow;
88810105Sadam.leventhal@sun.com 
88910105Sadam.leventhal@sun.com 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
89010105Sadam.leventhal@sun.com 
89110105Sadam.leventhal@sun.com 	/*
89210105Sadam.leventhal@sun.com 	 * Fill in the missing rows of interest.
89310105Sadam.leventhal@sun.com 	 */
89410105Sadam.leventhal@sun.com 	for (i = 0; i < nmap; i++) {
89510105Sadam.leventhal@sun.com 		ASSERT3S(0, <=, map[i]);
89610105Sadam.leventhal@sun.com 		ASSERT3S(map[i], <=, 2);
89710105Sadam.leventhal@sun.com 
89810105Sadam.leventhal@sun.com 		pow = map[i] * n;
89910105Sadam.leventhal@sun.com 		if (pow > 255)
90010105Sadam.leventhal@sun.com 			pow -= 255;
90110105Sadam.leventhal@sun.com 		ASSERT(pow <= 255);
90210105Sadam.leventhal@sun.com 
90310105Sadam.leventhal@sun.com 		for (j = 0; j < n; j++) {
90410105Sadam.leventhal@sun.com 			pow -= map[i];
90510105Sadam.leventhal@sun.com 			if (pow < 0)
90610105Sadam.leventhal@sun.com 				pow += 255;
90710105Sadam.leventhal@sun.com 			rows[i][j] = vdev_raidz_pow2[pow];
90810105Sadam.leventhal@sun.com 		}
90910105Sadam.leventhal@sun.com 	}
9102082Seschrock }
9112082Seschrock 
91210105Sadam.leventhal@sun.com static void
91310105Sadam.leventhal@sun.com vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
91410105Sadam.leventhal@sun.com     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
91510105Sadam.leventhal@sun.com {
91610105Sadam.leventhal@sun.com 	int i, j, ii, jj;
91710105Sadam.leventhal@sun.com 	uint8_t log;
91810105Sadam.leventhal@sun.com 
91910105Sadam.leventhal@sun.com 	/*
92010105Sadam.leventhal@sun.com 	 * Assert that the first nmissing entries from the array of used
92110105Sadam.leventhal@sun.com 	 * columns correspond to parity columns and that subsequent entries
92210105Sadam.leventhal@sun.com 	 * correspond to data columns.
92310105Sadam.leventhal@sun.com 	 */
92410105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
92510105Sadam.leventhal@sun.com 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
92610105Sadam.leventhal@sun.com 	}
92710105Sadam.leventhal@sun.com 	for (; i < n; i++) {
92810105Sadam.leventhal@sun.com 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
92910105Sadam.leventhal@sun.com 	}
93010105Sadam.leventhal@sun.com 
93110105Sadam.leventhal@sun.com 	/*
93210105Sadam.leventhal@sun.com 	 * First initialize the storage where we'll compute the inverse rows.
93310105Sadam.leventhal@sun.com 	 */
93410105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
93510105Sadam.leventhal@sun.com 		for (j = 0; j < n; j++) {
93610105Sadam.leventhal@sun.com 			invrows[i][j] = (i == j) ? 1 : 0;
93710105Sadam.leventhal@sun.com 		}
93810105Sadam.leventhal@sun.com 	}
93910105Sadam.leventhal@sun.com 
94010105Sadam.leventhal@sun.com 	/*
94110105Sadam.leventhal@sun.com 	 * Subtract all trivial rows from the rows of consequence.
94210105Sadam.leventhal@sun.com 	 */
94310105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
94410105Sadam.leventhal@sun.com 		for (j = nmissing; j < n; j++) {
94510105Sadam.leventhal@sun.com 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
94610105Sadam.leventhal@sun.com 			jj = used[j] - rm->rm_firstdatacol;
94710105Sadam.leventhal@sun.com 			ASSERT3S(jj, <, n);
94810105Sadam.leventhal@sun.com 			invrows[i][j] = rows[i][jj];
94910105Sadam.leventhal@sun.com 			rows[i][jj] = 0;
95010105Sadam.leventhal@sun.com 		}
95110105Sadam.leventhal@sun.com 	}
95210105Sadam.leventhal@sun.com 
95310105Sadam.leventhal@sun.com 	/*
95410105Sadam.leventhal@sun.com 	 * For each of the rows of interest, we must normalize it and subtract
95510105Sadam.leventhal@sun.com 	 * a multiple of it from the other rows.
95610105Sadam.leventhal@sun.com 	 */
95710105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
95810105Sadam.leventhal@sun.com 		for (j = 0; j < missing[i]; j++) {
95910105Sadam.leventhal@sun.com 			ASSERT3U(rows[i][j], ==, 0);
96010105Sadam.leventhal@sun.com 		}
96110105Sadam.leventhal@sun.com 		ASSERT3U(rows[i][missing[i]], !=, 0);
96210105Sadam.leventhal@sun.com 
96310105Sadam.leventhal@sun.com 		/*
96410105Sadam.leventhal@sun.com 		 * Compute the inverse of the first element and multiply each
96510105Sadam.leventhal@sun.com 		 * element in the row by that value.
96610105Sadam.leventhal@sun.com 		 */
96710105Sadam.leventhal@sun.com 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
96810105Sadam.leventhal@sun.com 
96910105Sadam.leventhal@sun.com 		for (j = 0; j < n; j++) {
97010105Sadam.leventhal@sun.com 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
97110105Sadam.leventhal@sun.com 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
97210105Sadam.leventhal@sun.com 		}
97310105Sadam.leventhal@sun.com 
97410105Sadam.leventhal@sun.com 		for (ii = 0; ii < nmissing; ii++) {
97510105Sadam.leventhal@sun.com 			if (i == ii)
97610105Sadam.leventhal@sun.com 				continue;
97710105Sadam.leventhal@sun.com 
97810105Sadam.leventhal@sun.com 			ASSERT3U(rows[ii][missing[i]], !=, 0);
97910105Sadam.leventhal@sun.com 
98010105Sadam.leventhal@sun.com 			log = vdev_raidz_log2[rows[ii][missing[i]]];
98110105Sadam.leventhal@sun.com 
98210105Sadam.leventhal@sun.com 			for (j = 0; j < n; j++) {
98310105Sadam.leventhal@sun.com 				rows[ii][j] ^=
98410105Sadam.leventhal@sun.com 				    vdev_raidz_exp2(rows[i][j], log);
98510105Sadam.leventhal@sun.com 				invrows[ii][j] ^=
98610105Sadam.leventhal@sun.com 				    vdev_raidz_exp2(invrows[i][j], log);
98710105Sadam.leventhal@sun.com 			}
98810105Sadam.leventhal@sun.com 		}
98910105Sadam.leventhal@sun.com 	}
99010105Sadam.leventhal@sun.com 
99110105Sadam.leventhal@sun.com 	/*
99210105Sadam.leventhal@sun.com 	 * Verify that the data that is left in the rows are properly part of
99310105Sadam.leventhal@sun.com 	 * an identity matrix.
99410105Sadam.leventhal@sun.com 	 */
99510105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
99610105Sadam.leventhal@sun.com 		for (j = 0; j < n; j++) {
99710105Sadam.leventhal@sun.com 			if (j == missing[i]) {
99810105Sadam.leventhal@sun.com 				ASSERT3U(rows[i][j], ==, 1);
99910105Sadam.leventhal@sun.com 			} else {
100010105Sadam.leventhal@sun.com 				ASSERT3U(rows[i][j], ==, 0);
100110105Sadam.leventhal@sun.com 			}
100210105Sadam.leventhal@sun.com 		}
100310105Sadam.leventhal@sun.com 	}
100410105Sadam.leventhal@sun.com }
100510105Sadam.leventhal@sun.com 
100610105Sadam.leventhal@sun.com static void
100710105Sadam.leventhal@sun.com vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
100810105Sadam.leventhal@sun.com     int *missing, uint8_t **invrows, const uint8_t *used)
100910105Sadam.leventhal@sun.com {
101010105Sadam.leventhal@sun.com 	int i, j, x, cc, c;
101110105Sadam.leventhal@sun.com 	uint8_t *src;
101210105Sadam.leventhal@sun.com 	uint64_t ccount;
101310105Sadam.leventhal@sun.com 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
101410105Sadam.leventhal@sun.com 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
101510105Sadam.leventhal@sun.com 	uint8_t log, val;
101610105Sadam.leventhal@sun.com 	int ll;
101710105Sadam.leventhal@sun.com 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
101810105Sadam.leventhal@sun.com 	uint8_t *p, *pp;
101910105Sadam.leventhal@sun.com 	size_t psize;
102010105Sadam.leventhal@sun.com 
102110105Sadam.leventhal@sun.com 	psize = sizeof (invlog[0][0]) * n * nmissing;
102210105Sadam.leventhal@sun.com 	p = kmem_alloc(psize, KM_SLEEP);
102310105Sadam.leventhal@sun.com 
102410105Sadam.leventhal@sun.com 	for (pp = p, i = 0; i < nmissing; i++) {
102510105Sadam.leventhal@sun.com 		invlog[i] = pp;
102610105Sadam.leventhal@sun.com 		pp += n;
102710105Sadam.leventhal@sun.com 	}
102810105Sadam.leventhal@sun.com 
102910105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
103010105Sadam.leventhal@sun.com 		for (j = 0; j < n; j++) {
103110105Sadam.leventhal@sun.com 			ASSERT3U(invrows[i][j], !=, 0);
103210105Sadam.leventhal@sun.com 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
103310105Sadam.leventhal@sun.com 		}
103410105Sadam.leventhal@sun.com 	}
103510105Sadam.leventhal@sun.com 
103610105Sadam.leventhal@sun.com 	for (i = 0; i < n; i++) {
103710105Sadam.leventhal@sun.com 		c = used[i];
103810105Sadam.leventhal@sun.com 		ASSERT3U(c, <, rm->rm_cols);
103910105Sadam.leventhal@sun.com 
104010105Sadam.leventhal@sun.com 		src = rm->rm_col[c].rc_data;
104110105Sadam.leventhal@sun.com 		ccount = rm->rm_col[c].rc_size;
104210105Sadam.leventhal@sun.com 		for (j = 0; j < nmissing; j++) {
104310105Sadam.leventhal@sun.com 			cc = missing[j] + rm->rm_firstdatacol;
104410105Sadam.leventhal@sun.com 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
104510105Sadam.leventhal@sun.com 			ASSERT3U(cc, <, rm->rm_cols);
104610105Sadam.leventhal@sun.com 			ASSERT3U(cc, !=, c);
104710105Sadam.leventhal@sun.com 
104810105Sadam.leventhal@sun.com 			dst[j] = rm->rm_col[cc].rc_data;
104910105Sadam.leventhal@sun.com 			dcount[j] = rm->rm_col[cc].rc_size;
105010105Sadam.leventhal@sun.com 		}
105110105Sadam.leventhal@sun.com 
105210105Sadam.leventhal@sun.com 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
105310105Sadam.leventhal@sun.com 
105410105Sadam.leventhal@sun.com 		for (x = 0; x < ccount; x++, src++) {
105510105Sadam.leventhal@sun.com 			if (*src != 0)
105610105Sadam.leventhal@sun.com 				log = vdev_raidz_log2[*src];
105710105Sadam.leventhal@sun.com 
105810105Sadam.leventhal@sun.com 			for (cc = 0; cc < nmissing; cc++) {
105910105Sadam.leventhal@sun.com 				if (x >= dcount[cc])
106010105Sadam.leventhal@sun.com 					continue;
106110105Sadam.leventhal@sun.com 
106210105Sadam.leventhal@sun.com 				if (*src == 0) {
106310105Sadam.leventhal@sun.com 					val = 0;
106410105Sadam.leventhal@sun.com 				} else {
106510105Sadam.leventhal@sun.com 					if ((ll = log + invlog[cc][i]) >= 255)
106610105Sadam.leventhal@sun.com 						ll -= 255;
106710105Sadam.leventhal@sun.com 					val = vdev_raidz_pow2[ll];
106810105Sadam.leventhal@sun.com 				}
106910105Sadam.leventhal@sun.com 
107010105Sadam.leventhal@sun.com 				if (i == 0)
107110105Sadam.leventhal@sun.com 					dst[cc][x] = val;
107210105Sadam.leventhal@sun.com 				else
107310105Sadam.leventhal@sun.com 					dst[cc][x] ^= val;
107410105Sadam.leventhal@sun.com 			}
107510105Sadam.leventhal@sun.com 		}
107610105Sadam.leventhal@sun.com 	}
107710105Sadam.leventhal@sun.com 
107810105Sadam.leventhal@sun.com 	kmem_free(p, psize);
107910105Sadam.leventhal@sun.com }
108010105Sadam.leventhal@sun.com 
108110105Sadam.leventhal@sun.com static int
108210105Sadam.leventhal@sun.com vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
108310105Sadam.leventhal@sun.com {
108410105Sadam.leventhal@sun.com 	int n, i, c, t, tt;
108510105Sadam.leventhal@sun.com 	int nmissing_rows;
108610105Sadam.leventhal@sun.com 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
108710105Sadam.leventhal@sun.com 	int parity_map[VDEV_RAIDZ_MAXPARITY];
108810105Sadam.leventhal@sun.com 
108910105Sadam.leventhal@sun.com 	uint8_t *p, *pp;
109010105Sadam.leventhal@sun.com 	size_t psize;
109110105Sadam.leventhal@sun.com 
109210105Sadam.leventhal@sun.com 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
109310105Sadam.leventhal@sun.com 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
109410105Sadam.leventhal@sun.com 	uint8_t *used;
109510105Sadam.leventhal@sun.com 
109610105Sadam.leventhal@sun.com 	int code = 0;
109710105Sadam.leventhal@sun.com 
109810105Sadam.leventhal@sun.com 
109910105Sadam.leventhal@sun.com 	n = rm->rm_cols - rm->rm_firstdatacol;
110010105Sadam.leventhal@sun.com 
110110105Sadam.leventhal@sun.com 	/*
110210105Sadam.leventhal@sun.com 	 * Figure out which data columns are missing.
110310105Sadam.leventhal@sun.com 	 */
110410105Sadam.leventhal@sun.com 	nmissing_rows = 0;
110510105Sadam.leventhal@sun.com 	for (t = 0; t < ntgts; t++) {
110610105Sadam.leventhal@sun.com 		if (tgts[t] >= rm->rm_firstdatacol) {
110710105Sadam.leventhal@sun.com 			missing_rows[nmissing_rows++] =
110810105Sadam.leventhal@sun.com 			    tgts[t] - rm->rm_firstdatacol;
110910105Sadam.leventhal@sun.com 		}
111010105Sadam.leventhal@sun.com 	}
111110105Sadam.leventhal@sun.com 
111210105Sadam.leventhal@sun.com 	/*
111310105Sadam.leventhal@sun.com 	 * Figure out which parity columns to use to help generate the missing
111410105Sadam.leventhal@sun.com 	 * data columns.
111510105Sadam.leventhal@sun.com 	 */
111610105Sadam.leventhal@sun.com 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
111710105Sadam.leventhal@sun.com 		ASSERT(tt < ntgts);
111810105Sadam.leventhal@sun.com 		ASSERT(c < rm->rm_firstdatacol);
111910105Sadam.leventhal@sun.com 
112010105Sadam.leventhal@sun.com 		/*
112110105Sadam.leventhal@sun.com 		 * Skip any targeted parity columns.
112210105Sadam.leventhal@sun.com 		 */
112310105Sadam.leventhal@sun.com 		if (c == tgts[tt]) {
112410105Sadam.leventhal@sun.com 			tt++;
112510105Sadam.leventhal@sun.com 			continue;
112610105Sadam.leventhal@sun.com 		}
112710105Sadam.leventhal@sun.com 
112810105Sadam.leventhal@sun.com 		code |= 1 << c;
112910105Sadam.leventhal@sun.com 
113010105Sadam.leventhal@sun.com 		parity_map[i] = c;
113110105Sadam.leventhal@sun.com 		i++;
113210105Sadam.leventhal@sun.com 	}
113310105Sadam.leventhal@sun.com 
113410105Sadam.leventhal@sun.com 	ASSERT(code != 0);
113510105Sadam.leventhal@sun.com 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
113610105Sadam.leventhal@sun.com 
113710105Sadam.leventhal@sun.com 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
113810105Sadam.leventhal@sun.com 	    nmissing_rows * n + sizeof (used[0]) * n;
113910105Sadam.leventhal@sun.com 	p = kmem_alloc(psize, KM_SLEEP);
114010105Sadam.leventhal@sun.com 
114110105Sadam.leventhal@sun.com 	for (pp = p, i = 0; i < nmissing_rows; i++) {
114210105Sadam.leventhal@sun.com 		rows[i] = pp;
114310105Sadam.leventhal@sun.com 		pp += n;
114410105Sadam.leventhal@sun.com 		invrows[i] = pp;
114510105Sadam.leventhal@sun.com 		pp += n;
114610105Sadam.leventhal@sun.com 	}
114710105Sadam.leventhal@sun.com 	used = pp;
114810105Sadam.leventhal@sun.com 
114910105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing_rows; i++) {
115010105Sadam.leventhal@sun.com 		used[i] = parity_map[i];
115110105Sadam.leventhal@sun.com 	}
115210105Sadam.leventhal@sun.com 
115310105Sadam.leventhal@sun.com 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
115410105Sadam.leventhal@sun.com 		if (tt < nmissing_rows &&
115510105Sadam.leventhal@sun.com 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
115610105Sadam.leventhal@sun.com 			tt++;
115710105Sadam.leventhal@sun.com 			continue;
115810105Sadam.leventhal@sun.com 		}
115910105Sadam.leventhal@sun.com 
116010105Sadam.leventhal@sun.com 		ASSERT3S(i, <, n);
116110105Sadam.leventhal@sun.com 		used[i] = c;
116210105Sadam.leventhal@sun.com 		i++;
116310105Sadam.leventhal@sun.com 	}
116410105Sadam.leventhal@sun.com 
116510105Sadam.leventhal@sun.com 	/*
116610105Sadam.leventhal@sun.com 	 * Initialize the interesting rows of the matrix.
116710105Sadam.leventhal@sun.com 	 */
116810105Sadam.leventhal@sun.com 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
116910105Sadam.leventhal@sun.com 
117010105Sadam.leventhal@sun.com 	/*
117110105Sadam.leventhal@sun.com 	 * Invert the matrix.
117210105Sadam.leventhal@sun.com 	 */
117310105Sadam.leventhal@sun.com 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
117410105Sadam.leventhal@sun.com 	    invrows, used);
117510105Sadam.leventhal@sun.com 
117610105Sadam.leventhal@sun.com 	/*
117710105Sadam.leventhal@sun.com 	 * Reconstruct the missing data using the generated matrix.
117810105Sadam.leventhal@sun.com 	 */
117910105Sadam.leventhal@sun.com 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
118010105Sadam.leventhal@sun.com 	    invrows, used);
118110105Sadam.leventhal@sun.com 
118210105Sadam.leventhal@sun.com 	kmem_free(p, psize);
118310105Sadam.leventhal@sun.com 
118410105Sadam.leventhal@sun.com 	return (code);
118510105Sadam.leventhal@sun.com }
118610105Sadam.leventhal@sun.com 
118710105Sadam.leventhal@sun.com static int
118810105Sadam.leventhal@sun.com vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
118910105Sadam.leventhal@sun.com {
119010105Sadam.leventhal@sun.com 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
119110105Sadam.leventhal@sun.com 	int ntgts;
119210105Sadam.leventhal@sun.com 	int i, c;
119310105Sadam.leventhal@sun.com 	int code;
119410105Sadam.leventhal@sun.com 	int nbadparity, nbaddata;
119510105Sadam.leventhal@sun.com 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
119610105Sadam.leventhal@sun.com 
119710105Sadam.leventhal@sun.com 	/*
119810105Sadam.leventhal@sun.com 	 * The tgts list must already be sorted.
119910105Sadam.leventhal@sun.com 	 */
120010105Sadam.leventhal@sun.com 	for (i = 1; i < nt; i++) {
120110105Sadam.leventhal@sun.com 		ASSERT(t[i] > t[i - 1]);
120210105Sadam.leventhal@sun.com 	}
120310105Sadam.leventhal@sun.com 
120410105Sadam.leventhal@sun.com 	nbadparity = rm->rm_firstdatacol;
120510105Sadam.leventhal@sun.com 	nbaddata = rm->rm_cols - nbadparity;
120610105Sadam.leventhal@sun.com 	ntgts = 0;
120710105Sadam.leventhal@sun.com 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
120810105Sadam.leventhal@sun.com 		if (c < rm->rm_firstdatacol)
120910105Sadam.leventhal@sun.com 			parity_valid[c] = B_FALSE;
121010105Sadam.leventhal@sun.com 
121110105Sadam.leventhal@sun.com 		if (i < nt && c == t[i]) {
121210105Sadam.leventhal@sun.com 			tgts[ntgts++] = c;
121310105Sadam.leventhal@sun.com 			i++;
121410105Sadam.leventhal@sun.com 		} else if (rm->rm_col[c].rc_error != 0) {
121510105Sadam.leventhal@sun.com 			tgts[ntgts++] = c;
121610105Sadam.leventhal@sun.com 		} else if (c >= rm->rm_firstdatacol) {
121710105Sadam.leventhal@sun.com 			nbaddata--;
121810105Sadam.leventhal@sun.com 		} else {
121910105Sadam.leventhal@sun.com 			parity_valid[c] = B_TRUE;
122010105Sadam.leventhal@sun.com 			nbadparity--;
122110105Sadam.leventhal@sun.com 		}
122210105Sadam.leventhal@sun.com 	}
122310105Sadam.leventhal@sun.com 
122410105Sadam.leventhal@sun.com 	ASSERT(ntgts >= nt);
122510105Sadam.leventhal@sun.com 	ASSERT(nbaddata >= 0);
122610105Sadam.leventhal@sun.com 	ASSERT(nbaddata + nbadparity == ntgts);
122710105Sadam.leventhal@sun.com 
122810105Sadam.leventhal@sun.com 	dt = &tgts[nbadparity];
122910105Sadam.leventhal@sun.com 
123010105Sadam.leventhal@sun.com 	/*
123110105Sadam.leventhal@sun.com 	 * See if we can use any of our optimized reconstruction routines.
123210105Sadam.leventhal@sun.com 	 */
123310105Sadam.leventhal@sun.com 	if (!vdev_raidz_default_to_general) {
123410105Sadam.leventhal@sun.com 		switch (nbaddata) {
123510105Sadam.leventhal@sun.com 		case 1:
123610105Sadam.leventhal@sun.com 			if (parity_valid[VDEV_RAIDZ_P])
123710105Sadam.leventhal@sun.com 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
123810105Sadam.leventhal@sun.com 
123910105Sadam.leventhal@sun.com 			ASSERT(rm->rm_firstdatacol > 1);
124010105Sadam.leventhal@sun.com 
124110105Sadam.leventhal@sun.com 			if (parity_valid[VDEV_RAIDZ_Q])
124210105Sadam.leventhal@sun.com 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
124310105Sadam.leventhal@sun.com 
124410105Sadam.leventhal@sun.com 			ASSERT(rm->rm_firstdatacol > 2);
124510105Sadam.leventhal@sun.com 			break;
124610105Sadam.leventhal@sun.com 
124710105Sadam.leventhal@sun.com 		case 2:
124810105Sadam.leventhal@sun.com 			ASSERT(rm->rm_firstdatacol > 1);
124910105Sadam.leventhal@sun.com 
125010105Sadam.leventhal@sun.com 			if (parity_valid[VDEV_RAIDZ_P] &&
125110105Sadam.leventhal@sun.com 			    parity_valid[VDEV_RAIDZ_Q])
125210105Sadam.leventhal@sun.com 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
125310105Sadam.leventhal@sun.com 
125410105Sadam.leventhal@sun.com 			ASSERT(rm->rm_firstdatacol > 2);
125510105Sadam.leventhal@sun.com 
125610105Sadam.leventhal@sun.com 			break;
125710105Sadam.leventhal@sun.com 		}
125810105Sadam.leventhal@sun.com 	}
125910105Sadam.leventhal@sun.com 
126010105Sadam.leventhal@sun.com 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
126110105Sadam.leventhal@sun.com 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
126210105Sadam.leventhal@sun.com 	ASSERT(code > 0);
126310105Sadam.leventhal@sun.com 	return (code);
126410105Sadam.leventhal@sun.com }
12652082Seschrock 
1266789Sahrens static int
1267789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
1268789Sahrens {
126910105Sadam.leventhal@sun.com 	vdev_t *cvd;
12702082Seschrock 	uint64_t nparity = vd->vdev_nparity;
127110105Sadam.leventhal@sun.com 	int c;
1272789Sahrens 	int lasterror = 0;
1273789Sahrens 	int numerrors = 0;
1274789Sahrens 
12752082Seschrock 	ASSERT(nparity > 0);
12762082Seschrock 
12772082Seschrock 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
12782082Seschrock 	    vd->vdev_children < nparity + 1) {
1279789Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1280789Sahrens 		return (EINVAL);
1281789Sahrens 	}
1282789Sahrens 
12839846SEric.Taylor@Sun.COM 	vdev_open_children(vd);
1284789Sahrens 
128510105Sadam.leventhal@sun.com 	for (c = 0; c < vd->vdev_children; c++) {
128610105Sadam.leventhal@sun.com 		cvd = vd->vdev_child[c];
12879846SEric.Taylor@Sun.COM 
128810105Sadam.leventhal@sun.com 		if (cvd->vdev_open_error != 0) {
12899846SEric.Taylor@Sun.COM 			lasterror = cvd->vdev_open_error;
1290789Sahrens 			numerrors++;
1291789Sahrens 			continue;
1292789Sahrens 		}
1293789Sahrens 
1294789Sahrens 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
12951732Sbonwick 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1296789Sahrens 	}
1297789Sahrens 
1298789Sahrens 	*asize *= vd->vdev_children;
1299789Sahrens 
13002082Seschrock 	if (numerrors > nparity) {
1301789Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1302789Sahrens 		return (lasterror);
1303789Sahrens 	}
1304789Sahrens 
1305789Sahrens 	return (0);
1306789Sahrens }
1307789Sahrens 
1308789Sahrens static void
1309789Sahrens vdev_raidz_close(vdev_t *vd)
1310789Sahrens {
131110105Sadam.leventhal@sun.com 	int c;
131210105Sadam.leventhal@sun.com 
131310105Sadam.leventhal@sun.com 	for (c = 0; c < vd->vdev_children; c++)
1314789Sahrens 		vdev_close(vd->vdev_child[c]);
1315789Sahrens }
1316789Sahrens 
1317789Sahrens static uint64_t
1318789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1319789Sahrens {
1320789Sahrens 	uint64_t asize;
13211732Sbonwick 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1322789Sahrens 	uint64_t cols = vd->vdev_children;
13232082Seschrock 	uint64_t nparity = vd->vdev_nparity;
1324789Sahrens 
13251732Sbonwick 	asize = ((psize - 1) >> ashift) + 1;
13262082Seschrock 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
13272082Seschrock 	asize = roundup(asize, nparity + 1) << ashift;
1328789Sahrens 
1329789Sahrens 	return (asize);
1330789Sahrens }
1331789Sahrens 
1332789Sahrens static void
1333789Sahrens vdev_raidz_child_done(zio_t *zio)
1334789Sahrens {
1335789Sahrens 	raidz_col_t *rc = zio->io_private;
1336789Sahrens 
1337789Sahrens 	rc->rc_error = zio->io_error;
1338789Sahrens 	rc->rc_tried = 1;
1339789Sahrens 	rc->rc_skipped = 0;
1340789Sahrens }
1341789Sahrens 
13425530Sbonwick static int
1343789Sahrens vdev_raidz_io_start(zio_t *zio)
1344789Sahrens {
1345789Sahrens 	vdev_t *vd = zio->io_vd;
13461732Sbonwick 	vdev_t *tvd = vd->vdev_top;
1347789Sahrens 	vdev_t *cvd;
1348789Sahrens 	blkptr_t *bp = zio->io_bp;
1349789Sahrens 	raidz_map_t *rm;
1350789Sahrens 	raidz_col_t *rc;
135110105Sadam.leventhal@sun.com 	int c, i;
1352789Sahrens 
13532082Seschrock 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
13542082Seschrock 	    vd->vdev_nparity);
1355789Sahrens 
13561775Sbillm 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1357789Sahrens 
1358789Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
135910105Sadam.leventhal@sun.com 		vdev_raidz_generate_parity(rm);
1360789Sahrens 
1361789Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
1362789Sahrens 			rc = &rm->rm_col[c];
13632082Seschrock 			cvd = vd->vdev_child[rc->rc_devidx];
1364789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1365789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
13667754SJeff.Bonwick@Sun.COM 			    zio->io_type, zio->io_priority, 0,
1367789Sahrens 			    vdev_raidz_child_done, rc));
1368789Sahrens 		}
13695530Sbonwick 
137010105Sadam.leventhal@sun.com 		/*
137110105Sadam.leventhal@sun.com 		 * Generate optional I/Os for any skipped sectors to improve
137210105Sadam.leventhal@sun.com 		 * aggregation contiguity.
137310105Sadam.leventhal@sun.com 		 */
1374*10450Sadam.leventhal@sun.com 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
137510105Sadam.leventhal@sun.com 			ASSERT(c <= rm->rm_scols);
137610105Sadam.leventhal@sun.com 			if (c == rm->rm_scols)
137710105Sadam.leventhal@sun.com 				c = 0;
137810105Sadam.leventhal@sun.com 			rc = &rm->rm_col[c];
137910105Sadam.leventhal@sun.com 			cvd = vd->vdev_child[rc->rc_devidx];
138010105Sadam.leventhal@sun.com 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
138110105Sadam.leventhal@sun.com 			    rc->rc_offset + rc->rc_size, NULL,
138210105Sadam.leventhal@sun.com 			    1 << tvd->vdev_ashift,
138310105Sadam.leventhal@sun.com 			    zio->io_type, zio->io_priority,
138410105Sadam.leventhal@sun.com 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
138510105Sadam.leventhal@sun.com 		}
138610105Sadam.leventhal@sun.com 
13877754SJeff.Bonwick@Sun.COM 		return (ZIO_PIPELINE_CONTINUE);
1388789Sahrens 	}
1389789Sahrens 
1390789Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1391789Sahrens 
13922082Seschrock 	/*
13932082Seschrock 	 * Iterate over the columns in reverse order so that we hit the parity
139410105Sadam.leventhal@sun.com 	 * last -- any errors along the way will force us to read the parity.
13952082Seschrock 	 */
1396789Sahrens 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1397789Sahrens 		rc = &rm->rm_col[c];
13982082Seschrock 		cvd = vd->vdev_child[rc->rc_devidx];
13995329Sgw25295 		if (!vdev_readable(cvd)) {
14002082Seschrock 			if (c >= rm->rm_firstdatacol)
14012082Seschrock 				rm->rm_missingdata++;
14022082Seschrock 			else
14032082Seschrock 				rm->rm_missingparity++;
1404789Sahrens 			rc->rc_error = ENXIO;
1405789Sahrens 			rc->rc_tried = 1;	/* don't even try */
1406789Sahrens 			rc->rc_skipped = 1;
1407789Sahrens 			continue;
1408789Sahrens 		}
14098241SJeff.Bonwick@Sun.COM 		if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
14102082Seschrock 			if (c >= rm->rm_firstdatacol)
14112082Seschrock 				rm->rm_missingdata++;
14122082Seschrock 			else
14132082Seschrock 				rm->rm_missingparity++;
1414789Sahrens 			rc->rc_error = ESTALE;
1415789Sahrens 			rc->rc_skipped = 1;
1416789Sahrens 			continue;
1417789Sahrens 		}
14182082Seschrock 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
14199434SMark.Musante@Sun.COM 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1420789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1421789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
14227754SJeff.Bonwick@Sun.COM 			    zio->io_type, zio->io_priority, 0,
1423789Sahrens 			    vdev_raidz_child_done, rc));
1424789Sahrens 		}
1425789Sahrens 	}
1426789Sahrens 
14277754SJeff.Bonwick@Sun.COM 	return (ZIO_PIPELINE_CONTINUE);
1428789Sahrens }
1429789Sahrens 
14301544Seschrock /*
14311544Seschrock  * Report a checksum error for a child of a RAID-Z device.
14321544Seschrock  */
14331544Seschrock static void
14341544Seschrock raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
14351544Seschrock {
14362082Seschrock 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
14371544Seschrock 
14381544Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
14391544Seschrock 		mutex_enter(&vd->vdev_stat_lock);
14401544Seschrock 		vd->vdev_stat.vs_checksum_errors++;
14411544Seschrock 		mutex_exit(&vd->vdev_stat_lock);
14421544Seschrock 	}
14431544Seschrock 
14441544Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
14451544Seschrock 		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
14461544Seschrock 		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
14471544Seschrock }
14481544Seschrock 
14492082Seschrock /*
14502082Seschrock  * Generate the parity from the data columns. If we tried and were able to
14512082Seschrock  * read the parity without error, verify that the generated parity matches the
14522082Seschrock  * data we read. If it doesn't, we fire off a checksum error. Return the
14532082Seschrock  * number such failures.
14542082Seschrock  */
14552082Seschrock static int
14562082Seschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
14572082Seschrock {
14582082Seschrock 	void *orig[VDEV_RAIDZ_MAXPARITY];
14592082Seschrock 	int c, ret = 0;
14602082Seschrock 	raidz_col_t *rc;
14612082Seschrock 
14622082Seschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
14632082Seschrock 		rc = &rm->rm_col[c];
14642082Seschrock 		if (!rc->rc_tried || rc->rc_error != 0)
14652082Seschrock 			continue;
14662082Seschrock 		orig[c] = zio_buf_alloc(rc->rc_size);
14672082Seschrock 		bcopy(rc->rc_data, orig[c], rc->rc_size);
14682082Seschrock 	}
14692082Seschrock 
147010105Sadam.leventhal@sun.com 	vdev_raidz_generate_parity(rm);
14712082Seschrock 
14722082Seschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
14732082Seschrock 		rc = &rm->rm_col[c];
14742082Seschrock 		if (!rc->rc_tried || rc->rc_error != 0)
14752082Seschrock 			continue;
14762082Seschrock 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
14772082Seschrock 			raidz_checksum_error(zio, rc);
14782082Seschrock 			rc->rc_error = ECKSUM;
14792082Seschrock 			ret++;
14802082Seschrock 		}
14812082Seschrock 		zio_buf_free(orig[c], rc->rc_size);
14822082Seschrock 	}
14832082Seschrock 
14842082Seschrock 	return (ret);
14852082Seschrock }
14862082Seschrock 
148710105Sadam.leventhal@sun.com /*
148810105Sadam.leventhal@sun.com  * Keep statistics on all the ways that we used parity to correct data.
148910105Sadam.leventhal@sun.com  */
149010105Sadam.leventhal@sun.com static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
14911544Seschrock 
14925530Sbonwick static int
14937754SJeff.Bonwick@Sun.COM vdev_raidz_worst_error(raidz_map_t *rm)
14947754SJeff.Bonwick@Sun.COM {
14957754SJeff.Bonwick@Sun.COM 	int error = 0;
14967754SJeff.Bonwick@Sun.COM 
14977754SJeff.Bonwick@Sun.COM 	for (int c = 0; c < rm->rm_cols; c++)
14987754SJeff.Bonwick@Sun.COM 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
14997754SJeff.Bonwick@Sun.COM 
15007754SJeff.Bonwick@Sun.COM 	return (error);
15017754SJeff.Bonwick@Sun.COM }
15027754SJeff.Bonwick@Sun.COM 
150310105Sadam.leventhal@sun.com /*
150410105Sadam.leventhal@sun.com  * Iterate over all combinations of bad data and attempt a reconstruction.
150510105Sadam.leventhal@sun.com  * Note that the algorithm below is non-optimal because it doesn't take into
150610105Sadam.leventhal@sun.com  * account how reconstruction is actually performed. For example, with
150710105Sadam.leventhal@sun.com  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
150810105Sadam.leventhal@sun.com  * is targeted as invalid as if columns 1 and 4 are targeted since in both
150910105Sadam.leventhal@sun.com  * cases we'd only use parity information in column 0.
151010105Sadam.leventhal@sun.com  */
151110105Sadam.leventhal@sun.com static int
151210105Sadam.leventhal@sun.com vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
151310105Sadam.leventhal@sun.com {
151410105Sadam.leventhal@sun.com 	raidz_map_t *rm = zio->io_vsd;
151510105Sadam.leventhal@sun.com 	raidz_col_t *rc;
151610105Sadam.leventhal@sun.com 	void *orig[VDEV_RAIDZ_MAXPARITY];
151710105Sadam.leventhal@sun.com 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
151810105Sadam.leventhal@sun.com 	int *tgts = &tstore[1];
151910105Sadam.leventhal@sun.com 	int current, next, i, c, n;
152010105Sadam.leventhal@sun.com 	int code, ret = 0;
152110105Sadam.leventhal@sun.com 
152210105Sadam.leventhal@sun.com 	ASSERT(total_errors < rm->rm_firstdatacol);
152310105Sadam.leventhal@sun.com 
152410105Sadam.leventhal@sun.com 	/*
152510105Sadam.leventhal@sun.com 	 * This simplifies one edge condition.
152610105Sadam.leventhal@sun.com 	 */
152710105Sadam.leventhal@sun.com 	tgts[-1] = -1;
152810105Sadam.leventhal@sun.com 
152910105Sadam.leventhal@sun.com 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
153010105Sadam.leventhal@sun.com 		/*
153110105Sadam.leventhal@sun.com 		 * Initialize the targets array by finding the first n columns
153210105Sadam.leventhal@sun.com 		 * that contain no error.
153310105Sadam.leventhal@sun.com 		 *
153410105Sadam.leventhal@sun.com 		 * If there were no data errors, we need to ensure that we're
153510105Sadam.leventhal@sun.com 		 * always explicitly attempting to reconstruct at least one
153610105Sadam.leventhal@sun.com 		 * data column. To do this, we simply push the highest target
153710105Sadam.leventhal@sun.com 		 * up into the data columns.
153810105Sadam.leventhal@sun.com 		 */
153910105Sadam.leventhal@sun.com 		for (c = 0, i = 0; i < n; i++) {
154010105Sadam.leventhal@sun.com 			if (i == n - 1 && data_errors == 0 &&
154110105Sadam.leventhal@sun.com 			    c < rm->rm_firstdatacol) {
154210105Sadam.leventhal@sun.com 				c = rm->rm_firstdatacol;
154310105Sadam.leventhal@sun.com 			}
154410105Sadam.leventhal@sun.com 
154510105Sadam.leventhal@sun.com 			while (rm->rm_col[c].rc_error != 0) {
154610105Sadam.leventhal@sun.com 				c++;
154710105Sadam.leventhal@sun.com 				ASSERT3S(c, <, rm->rm_cols);
154810105Sadam.leventhal@sun.com 			}
154910105Sadam.leventhal@sun.com 
155010105Sadam.leventhal@sun.com 			tgts[i] = c++;
155110105Sadam.leventhal@sun.com 		}
155210105Sadam.leventhal@sun.com 
155310105Sadam.leventhal@sun.com 		/*
155410105Sadam.leventhal@sun.com 		 * Setting tgts[n] simplifies the other edge condition.
155510105Sadam.leventhal@sun.com 		 */
155610105Sadam.leventhal@sun.com 		tgts[n] = rm->rm_cols;
155710105Sadam.leventhal@sun.com 
155810105Sadam.leventhal@sun.com 		/*
155910105Sadam.leventhal@sun.com 		 * These buffers were allocated in previous iterations.
156010105Sadam.leventhal@sun.com 		 */
156110105Sadam.leventhal@sun.com 		for (i = 0; i < n - 1; i++) {
156210105Sadam.leventhal@sun.com 			ASSERT(orig[i] != NULL);
156310105Sadam.leventhal@sun.com 		}
156410105Sadam.leventhal@sun.com 
156510105Sadam.leventhal@sun.com 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
156610105Sadam.leventhal@sun.com 
156710105Sadam.leventhal@sun.com 		current = 0;
156810105Sadam.leventhal@sun.com 		next = tgts[current];
156910105Sadam.leventhal@sun.com 
157010105Sadam.leventhal@sun.com 		while (current != n) {
157110105Sadam.leventhal@sun.com 			tgts[current] = next;
157210105Sadam.leventhal@sun.com 			current = 0;
157310105Sadam.leventhal@sun.com 
157410105Sadam.leventhal@sun.com 			/*
157510105Sadam.leventhal@sun.com 			 * Save off the original data that we're going to
157610105Sadam.leventhal@sun.com 			 * attempt to reconstruct.
157710105Sadam.leventhal@sun.com 			 */
157810105Sadam.leventhal@sun.com 			for (i = 0; i < n; i++) {
157910105Sadam.leventhal@sun.com 				ASSERT(orig[i] != NULL);
158010105Sadam.leventhal@sun.com 				c = tgts[i];
158110105Sadam.leventhal@sun.com 				ASSERT3S(c, >=, 0);
158210105Sadam.leventhal@sun.com 				ASSERT3S(c, <, rm->rm_cols);
158310105Sadam.leventhal@sun.com 				rc = &rm->rm_col[c];
158410105Sadam.leventhal@sun.com 				bcopy(rc->rc_data, orig[i], rc->rc_size);
158510105Sadam.leventhal@sun.com 			}
158610105Sadam.leventhal@sun.com 
158710105Sadam.leventhal@sun.com 			/*
158810105Sadam.leventhal@sun.com 			 * Attempt a reconstruction and exit the outer loop on
158910105Sadam.leventhal@sun.com 			 * success.
159010105Sadam.leventhal@sun.com 			 */
159110105Sadam.leventhal@sun.com 			code = vdev_raidz_reconstruct(rm, tgts, n);
159210105Sadam.leventhal@sun.com 			if (zio_checksum_error(zio) == 0) {
159310105Sadam.leventhal@sun.com 				atomic_inc_64(&raidz_corrected[code]);
159410105Sadam.leventhal@sun.com 
159510105Sadam.leventhal@sun.com 				for (i = 0; i < n; i++) {
159610105Sadam.leventhal@sun.com 					c = tgts[i];
159710105Sadam.leventhal@sun.com 					rc = &rm->rm_col[c];
159810105Sadam.leventhal@sun.com 					ASSERT(rc->rc_error == 0);
1599*10450Sadam.leventhal@sun.com 					if (rc->rc_tried) {
1600*10450Sadam.leventhal@sun.com 						if (bcmp(orig[i], rc->rc_data,
1601*10450Sadam.leventhal@sun.com 						    rc->rc_size) == 0)
1602*10450Sadam.leventhal@sun.com 							continue;
160310105Sadam.leventhal@sun.com 						raidz_checksum_error(zio, rc);
1604*10450Sadam.leventhal@sun.com 					}
160510105Sadam.leventhal@sun.com 					rc->rc_error = ECKSUM;
160610105Sadam.leventhal@sun.com 				}
160710105Sadam.leventhal@sun.com 
160810105Sadam.leventhal@sun.com 				ret = code;
160910105Sadam.leventhal@sun.com 				goto done;
161010105Sadam.leventhal@sun.com 			}
161110105Sadam.leventhal@sun.com 
161210105Sadam.leventhal@sun.com 			/*
161310105Sadam.leventhal@sun.com 			 * Restore the original data.
161410105Sadam.leventhal@sun.com 			 */
161510105Sadam.leventhal@sun.com 			for (i = 0; i < n; i++) {
161610105Sadam.leventhal@sun.com 				c = tgts[i];
161710105Sadam.leventhal@sun.com 				rc = &rm->rm_col[c];
161810105Sadam.leventhal@sun.com 				bcopy(orig[i], rc->rc_data, rc->rc_size);
161910105Sadam.leventhal@sun.com 			}
162010105Sadam.leventhal@sun.com 
162110105Sadam.leventhal@sun.com 			do {
162210105Sadam.leventhal@sun.com 				/*
162310105Sadam.leventhal@sun.com 				 * Find the next valid column after the current
162410105Sadam.leventhal@sun.com 				 * position..
162510105Sadam.leventhal@sun.com 				 */
162610105Sadam.leventhal@sun.com 				for (next = tgts[current] + 1;
162710105Sadam.leventhal@sun.com 				    next < rm->rm_cols &&
162810105Sadam.leventhal@sun.com 				    rm->rm_col[next].rc_error != 0; next++)
162910105Sadam.leventhal@sun.com 					continue;
163010105Sadam.leventhal@sun.com 
163110105Sadam.leventhal@sun.com 				ASSERT(next <= tgts[current + 1]);
163210105Sadam.leventhal@sun.com 
163310105Sadam.leventhal@sun.com 				/*
163410105Sadam.leventhal@sun.com 				 * If that spot is available, we're done here.
163510105Sadam.leventhal@sun.com 				 */
163610105Sadam.leventhal@sun.com 				if (next != tgts[current + 1])
163710105Sadam.leventhal@sun.com 					break;
163810105Sadam.leventhal@sun.com 
163910105Sadam.leventhal@sun.com 				/*
164010105Sadam.leventhal@sun.com 				 * Otherwise, find the next valid column after
164110105Sadam.leventhal@sun.com 				 * the previous position.
164210105Sadam.leventhal@sun.com 				 */
164310105Sadam.leventhal@sun.com 				for (c = tgts[current - 1] + 1;
164410105Sadam.leventhal@sun.com 				    rm->rm_col[c].rc_error != 0; c++)
164510105Sadam.leventhal@sun.com 					continue;
164610105Sadam.leventhal@sun.com 
164710105Sadam.leventhal@sun.com 				tgts[current] = c;
164810105Sadam.leventhal@sun.com 				current++;
164910105Sadam.leventhal@sun.com 
165010105Sadam.leventhal@sun.com 			} while (current != n);
165110105Sadam.leventhal@sun.com 		}
165210105Sadam.leventhal@sun.com 	}
165310105Sadam.leventhal@sun.com 	n--;
165410105Sadam.leventhal@sun.com done:
165510105Sadam.leventhal@sun.com 	for (i = 0; i < n; i++) {
165610105Sadam.leventhal@sun.com 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
165710105Sadam.leventhal@sun.com 	}
165810105Sadam.leventhal@sun.com 
165910105Sadam.leventhal@sun.com 	return (ret);
166010105Sadam.leventhal@sun.com }
166110105Sadam.leventhal@sun.com 
16627754SJeff.Bonwick@Sun.COM static void
1663789Sahrens vdev_raidz_io_done(zio_t *zio)
1664789Sahrens {
1665789Sahrens 	vdev_t *vd = zio->io_vd;
1666789Sahrens 	vdev_t *cvd;
1667789Sahrens 	raidz_map_t *rm = zio->io_vsd;
166810105Sadam.leventhal@sun.com 	raidz_col_t *rc;
1669789Sahrens 	int unexpected_errors = 0;
16702082Seschrock 	int parity_errors = 0;
16713456Sahl 	int parity_untried = 0;
16722082Seschrock 	int data_errors = 0;
16737754SJeff.Bonwick@Sun.COM 	int total_errors = 0;
167410105Sadam.leventhal@sun.com 	int n, c;
167510105Sadam.leventhal@sun.com 	int tgts[VDEV_RAIDZ_MAXPARITY];
167610105Sadam.leventhal@sun.com 	int code;
1677789Sahrens 
16781775Sbillm 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
1679789Sahrens 
16802082Seschrock 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
16812082Seschrock 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
16822082Seschrock 
1683789Sahrens 	for (c = 0; c < rm->rm_cols; c++) {
1684789Sahrens 		rc = &rm->rm_col[c];
1685789Sahrens 
1686789Sahrens 		if (rc->rc_error) {
16877754SJeff.Bonwick@Sun.COM 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
16882082Seschrock 
16892082Seschrock 			if (c < rm->rm_firstdatacol)
16902082Seschrock 				parity_errors++;
16912082Seschrock 			else
16922082Seschrock 				data_errors++;
16932082Seschrock 
1694789Sahrens 			if (!rc->rc_skipped)
1695789Sahrens 				unexpected_errors++;
16962082Seschrock 
16977754SJeff.Bonwick@Sun.COM 			total_errors++;
16983456Sahl 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
16993456Sahl 			parity_untried++;
1700789Sahrens 		}
1701789Sahrens 	}
1702789Sahrens 
1703789Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
1704789Sahrens 		/*
17057754SJeff.Bonwick@Sun.COM 		 * XXX -- for now, treat partial writes as a success.
17067754SJeff.Bonwick@Sun.COM 		 * (If we couldn't write enough columns to reconstruct
17077754SJeff.Bonwick@Sun.COM 		 * the data, the I/O failed.  Otherwise, good enough.)
17087754SJeff.Bonwick@Sun.COM 		 *
17097754SJeff.Bonwick@Sun.COM 		 * Now that we support write reallocation, it would be better
17107754SJeff.Bonwick@Sun.COM 		 * to treat partial failure as real failure unless there are
17117754SJeff.Bonwick@Sun.COM 		 * no non-degraded top-level vdevs left, and not update DTLs
17127754SJeff.Bonwick@Sun.COM 		 * if we intend to reallocate.
1713789Sahrens 		 */
1714789Sahrens 		/* XXPOLICY */
17157754SJeff.Bonwick@Sun.COM 		if (total_errors > rm->rm_firstdatacol)
17167754SJeff.Bonwick@Sun.COM 			zio->io_error = vdev_raidz_worst_error(rm);
1717789Sahrens 
17187754SJeff.Bonwick@Sun.COM 		return;
1719789Sahrens 	}
1720789Sahrens 
1721789Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
17222082Seschrock 	/*
17232082Seschrock 	 * There are three potential phases for a read:
17242082Seschrock 	 *	1. produce valid data from the columns read
17252082Seschrock 	 *	2. read all disks and try again
17262082Seschrock 	 *	3. perform combinatorial reconstruction
17272082Seschrock 	 *
17282082Seschrock 	 * Each phase is progressively both more expensive and less likely to
17292082Seschrock 	 * occur. If we encounter more errors than we can repair or all phases
17302082Seschrock 	 * fail, we have no choice but to return an error.
17312082Seschrock 	 */
1732789Sahrens 
1733789Sahrens 	/*
17342082Seschrock 	 * If the number of errors we saw was correctable -- less than or equal
17353456Sahl 	 * to the number of parity disks read -- attempt to produce data that
17363456Sahl 	 * has a valid checksum. Naturally, this case applies in the absence of
17373456Sahl 	 * any errors.
1738789Sahrens 	 */
17397754SJeff.Bonwick@Sun.COM 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
174010105Sadam.leventhal@sun.com 		if (data_errors == 0) {
17412082Seschrock 			if (zio_checksum_error(zio) == 0) {
17424034Sahl 				/*
17434034Sahl 				 * If we read parity information (unnecessarily
17444034Sahl 				 * as it happens since no reconstruction was
17454034Sahl 				 * needed) regenerate and verify the parity.
17464034Sahl 				 * We also regenerate parity when resilvering
17474034Sahl 				 * so we can write it out to the failed device
17484034Sahl 				 * later.
17494034Sahl 				 */
17503456Sahl 				if (parity_errors + parity_untried <
17514034Sahl 				    rm->rm_firstdatacol ||
17524034Sahl 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
17533456Sahl 					n = raidz_parity_verify(zio, rm);
17543456Sahl 					unexpected_errors += n;
17553456Sahl 					ASSERT(parity_errors + n <=
17563456Sahl 					    rm->rm_firstdatacol);
17573456Sahl 				}
17582082Seschrock 				goto done;
17592082Seschrock 			}
176010105Sadam.leventhal@sun.com 		} else {
17613456Sahl 			/*
17623456Sahl 			 * We either attempt to read all the parity columns or
17633456Sahl 			 * none of them. If we didn't try to read parity, we
17643456Sahl 			 * wouldn't be here in the correctable case. There must
17653456Sahl 			 * also have been fewer parity errors than parity
17663456Sahl 			 * columns or, again, we wouldn't be in this code path.
17673456Sahl 			 */
17683456Sahl 			ASSERT(parity_untried == 0);
17692082Seschrock 			ASSERT(parity_errors < rm->rm_firstdatacol);
17702082Seschrock 
17712082Seschrock 			/*
177210105Sadam.leventhal@sun.com 			 * Identify the data columns that reported an error.
17732082Seschrock 			 */
177410105Sadam.leventhal@sun.com 			n = 0;
17752082Seschrock 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
17762082Seschrock 				rc = &rm->rm_col[c];
177710105Sadam.leventhal@sun.com 				if (rc->rc_error != 0) {
177810105Sadam.leventhal@sun.com 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
177910105Sadam.leventhal@sun.com 					tgts[n++] = c;
178010105Sadam.leventhal@sun.com 				}
17812082Seschrock 			}
17822082Seschrock 
178310105Sadam.leventhal@sun.com 			ASSERT(rm->rm_firstdatacol >= n);
178410105Sadam.leventhal@sun.com 
178510105Sadam.leventhal@sun.com 			code = vdev_raidz_reconstruct(rm, tgts, n);
17862082Seschrock 
17872082Seschrock 			if (zio_checksum_error(zio) == 0) {
178810105Sadam.leventhal@sun.com 				atomic_inc_64(&raidz_corrected[code]);
1789789Sahrens 
17902082Seschrock 				/*
179110105Sadam.leventhal@sun.com 				 * If we read more parity disks than were used
179210105Sadam.leventhal@sun.com 				 * for reconstruction, confirm that the other
179310105Sadam.leventhal@sun.com 				 * parity disks produced correct data. This
179410105Sadam.leventhal@sun.com 				 * routine is suboptimal in that it regenerates
179510105Sadam.leventhal@sun.com 				 * the parity that we already used in addition
179610105Sadam.leventhal@sun.com 				 * to the parity that we're attempting to
179710105Sadam.leventhal@sun.com 				 * verify, but this should be a relatively
179810105Sadam.leventhal@sun.com 				 * uncommon case, and can be optimized if it
179910105Sadam.leventhal@sun.com 				 * becomes a problem. Note that we regenerate
180010105Sadam.leventhal@sun.com 				 * parity when resilvering so we can write it
180110105Sadam.leventhal@sun.com 				 * out to failed devices later.
18022082Seschrock 				 */
180310105Sadam.leventhal@sun.com 				if (parity_errors < rm->rm_firstdatacol - n ||
18044034Sahl 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
18052082Seschrock 					n = raidz_parity_verify(zio, rm);
18062082Seschrock 					unexpected_errors += n;
18072082Seschrock 					ASSERT(parity_errors + n <=
18082082Seschrock 					    rm->rm_firstdatacol);
18092082Seschrock 				}
18102082Seschrock 
18112082Seschrock 				goto done;
18122082Seschrock 			}
1813789Sahrens 		}
1814789Sahrens 	}
1815789Sahrens 
1816789Sahrens 	/*
18172082Seschrock 	 * This isn't a typical situation -- either we got a read error or
18182082Seschrock 	 * a child silently returned bad data. Read every block so we can
18192082Seschrock 	 * try again with as much data and parity as we can track down. If
18202082Seschrock 	 * we've already been through once before, all children will be marked
18212082Seschrock 	 * as tried so we'll proceed to combinatorial reconstruction.
1822789Sahrens 	 */
1823789Sahrens 	unexpected_errors = 1;
18242082Seschrock 	rm->rm_missingdata = 0;
18252082Seschrock 	rm->rm_missingparity = 0;
1826789Sahrens 
18272082Seschrock 	for (c = 0; c < rm->rm_cols; c++) {
18282082Seschrock 		if (rm->rm_col[c].rc_tried)
18292082Seschrock 			continue;
1830789Sahrens 
1831789Sahrens 		zio_vdev_io_redone(zio);
18322082Seschrock 		do {
1833789Sahrens 			rc = &rm->rm_col[c];
1834789Sahrens 			if (rc->rc_tried)
1835789Sahrens 				continue;
1836789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL,
18372082Seschrock 			    vd->vdev_child[rc->rc_devidx],
1838789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
18397754SJeff.Bonwick@Sun.COM 			    zio->io_type, zio->io_priority, 0,
1840789Sahrens 			    vdev_raidz_child_done, rc));
18412082Seschrock 		} while (++c < rm->rm_cols);
18425530Sbonwick 
18437754SJeff.Bonwick@Sun.COM 		return;
1844789Sahrens 	}
1845789Sahrens 
1846789Sahrens 	/*
18472082Seschrock 	 * At this point we've attempted to reconstruct the data given the
18482082Seschrock 	 * errors we detected, and we've attempted to read all columns. There
18492082Seschrock 	 * must, therefore, be one or more additional problems -- silent errors
18502082Seschrock 	 * resulting in invalid data rather than explicit I/O errors resulting
185110105Sadam.leventhal@sun.com 	 * in absent data. We check if there is enough additional data to
185210105Sadam.leventhal@sun.com 	 * possibly reconstruct the data and then perform combinatorial
185310105Sadam.leventhal@sun.com 	 * reconstruction over all possible combinations. If that fails,
185410105Sadam.leventhal@sun.com 	 * we're cooked.
1855789Sahrens 	 */
18567754SJeff.Bonwick@Sun.COM 	if (total_errors >= rm->rm_firstdatacol) {
18577754SJeff.Bonwick@Sun.COM 		zio->io_error = vdev_raidz_worst_error(rm);
18587754SJeff.Bonwick@Sun.COM 		/*
18597754SJeff.Bonwick@Sun.COM 		 * If there were exactly as many device errors as parity
18607754SJeff.Bonwick@Sun.COM 		 * columns, yet we couldn't reconstruct the data, then at
18617754SJeff.Bonwick@Sun.COM 		 * least one device must have returned bad data silently.
18627754SJeff.Bonwick@Sun.COM 		 */
18637754SJeff.Bonwick@Sun.COM 		if (total_errors == rm->rm_firstdatacol)
18647754SJeff.Bonwick@Sun.COM 			zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
18652082Seschrock 
186610105Sadam.leventhal@sun.com 	} else if ((code = vdev_raidz_combrec(zio, total_errors,
186710105Sadam.leventhal@sun.com 	    data_errors)) != 0) {
18682082Seschrock 		/*
186910105Sadam.leventhal@sun.com 		 * If we didn't use all the available parity for the
187010105Sadam.leventhal@sun.com 		 * combinatorial reconstruction, verify that the remaining
187110105Sadam.leventhal@sun.com 		 * parity is correct.
18722082Seschrock 		 */
187310105Sadam.leventhal@sun.com 		if (code != (1 << rm->rm_firstdatacol) - 1)
187410105Sadam.leventhal@sun.com 			(void) raidz_parity_verify(zio, rm);
187510105Sadam.leventhal@sun.com 	} else {
187610105Sadam.leventhal@sun.com 		/*
187710105Sadam.leventhal@sun.com 		 * All combinations failed to checksum. Generate checksum
187810105Sadam.leventhal@sun.com 		 * ereports for all children.
187910105Sadam.leventhal@sun.com 		 */
188010105Sadam.leventhal@sun.com 		zio->io_error = ECKSUM;
18812082Seschrock 
188210105Sadam.leventhal@sun.com 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
188310105Sadam.leventhal@sun.com 			for (c = 0; c < rm->rm_cols; c++) {
188410105Sadam.leventhal@sun.com 				rc = &rm->rm_col[c];
188510105Sadam.leventhal@sun.com 				zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
188610105Sadam.leventhal@sun.com 				    zio->io_spa, vd->vdev_child[rc->rc_devidx],
188710105Sadam.leventhal@sun.com 				    zio, rc->rc_offset, rc->rc_size);
18882082Seschrock 			}
18891544Seschrock 		}
18901544Seschrock 	}
1891789Sahrens 
1892789Sahrens done:
1893789Sahrens 	zio_checksum_verified(zio);
1894789Sahrens 
18958241SJeff.Bonwick@Sun.COM 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
1896789Sahrens 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1897789Sahrens 		/*
1898789Sahrens 		 * Use the good data we have in hand to repair damaged children.
1899789Sahrens 		 */
1900789Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
1901789Sahrens 			rc = &rm->rm_col[c];
19022082Seschrock 			cvd = vd->vdev_child[rc->rc_devidx];
1903789Sahrens 
19041732Sbonwick 			if (rc->rc_error == 0)
19051732Sbonwick 				continue;
19061732Sbonwick 
19077754SJeff.Bonwick@Sun.COM 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
19081732Sbonwick 			    rc->rc_offset, rc->rc_data, rc->rc_size,
19091732Sbonwick 			    ZIO_TYPE_WRITE, zio->io_priority,
19108241SJeff.Bonwick@Sun.COM 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
19118241SJeff.Bonwick@Sun.COM 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
19121732Sbonwick 		}
1913789Sahrens 	}
1914789Sahrens }
1915789Sahrens 
1916789Sahrens static void
1917789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1918789Sahrens {
19192082Seschrock 	if (faulted > vd->vdev_nparity)
19201544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
19211544Seschrock 		    VDEV_AUX_NO_REPLICAS);
1922789Sahrens 	else if (degraded + faulted != 0)
19231544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1924789Sahrens 	else
19251544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1926789Sahrens }
1927789Sahrens 
1928789Sahrens vdev_ops_t vdev_raidz_ops = {
1929789Sahrens 	vdev_raidz_open,
1930789Sahrens 	vdev_raidz_close,
1931789Sahrens 	vdev_raidz_asize,
1932789Sahrens 	vdev_raidz_io_start,
1933789Sahrens 	vdev_raidz_io_done,
1934789Sahrens 	vdev_raidz_state_change,
1935789Sahrens 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
1936789Sahrens 	B_FALSE			/* not a leaf vdev */
1937789Sahrens };
1938