xref: /onnv-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 10105:17811c723fb4)
1789Sahrens /*
2789Sahrens  * CDDL HEADER START
3789Sahrens  *
4789Sahrens  * The contents of this file are subject to the terms of the
51544Seschrock  * Common Development and Distribution License (the "License").
61544Seschrock  * You may not use this file except in compliance with the License.
7789Sahrens  *
8789Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9789Sahrens  * or http://www.opensolaris.org/os/licensing.
10789Sahrens  * See the License for the specific language governing permissions
11789Sahrens  * and limitations under the License.
12789Sahrens  *
13789Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14789Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15789Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16789Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17789Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18789Sahrens  *
19789Sahrens  * CDDL HEADER END
20789Sahrens  */
212082Seschrock 
22789Sahrens /*
239434SMark.Musante@Sun.COM  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24789Sahrens  * Use is subject to license terms.
25789Sahrens  */
26789Sahrens 
27789Sahrens #include <sys/zfs_context.h>
28789Sahrens #include <sys/spa.h>
29789Sahrens #include <sys/vdev_impl.h>
30789Sahrens #include <sys/zio.h>
31789Sahrens #include <sys/zio_checksum.h>
32789Sahrens #include <sys/fs/zfs.h>
331544Seschrock #include <sys/fm/fs/zfs.h>
34789Sahrens 
35789Sahrens /*
36789Sahrens  * Virtual device vector for RAID-Z.
372082Seschrock  *
38*10105Sadam.leventhal@sun.com  * This vdev supports single, double, and triple parity. For single parity,
39*10105Sadam.leventhal@sun.com  * we use a simple XOR of all the data columns. For double or triple parity,
40*10105Sadam.leventhal@sun.com  * we use a special case of Reed-Solomon coding. This extends the
41*10105Sadam.leventhal@sun.com  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42*10105Sadam.leventhal@sun.com  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43*10105Sadam.leventhal@sun.com  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44*10105Sadam.leventhal@sun.com  * former is also based. The latter is designed to provide higher performance
45*10105Sadam.leventhal@sun.com  * for writes.
46*10105Sadam.leventhal@sun.com  *
47*10105Sadam.leventhal@sun.com  * Note that the Plank paper claimed to support arbitrary N+M, but was then
48*10105Sadam.leventhal@sun.com  * amended six years later identifying a critical flaw that invalidates its
49*10105Sadam.leventhal@sun.com  * claims. Nevertheless, the technique can be adapted to work for up to
50*10105Sadam.leventhal@sun.com  * triple parity. For additional parity, the amendment "Note: Correction to
51*10105Sadam.leventhal@sun.com  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52*10105Sadam.leventhal@sun.com  * is viable, but the additional complexity means that write performance will
53*10105Sadam.leventhal@sun.com  * suffer.
54*10105Sadam.leventhal@sun.com  *
55*10105Sadam.leventhal@sun.com  * All of the methods above operate on a Galois field, defined over the
56*10105Sadam.leventhal@sun.com  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
57*10105Sadam.leventhal@sun.com  * can be expressed with a single byte. Briefly, the operations on the
58*10105Sadam.leventhal@sun.com  * field are defined as follows:
592082Seschrock  *
602082Seschrock  *   o addition (+) is represented by a bitwise XOR
612082Seschrock  *   o subtraction (-) is therefore identical to addition: A + B = A - B
622082Seschrock  *   o multiplication of A by 2 is defined by the following bitwise expression:
632082Seschrock  *	(A * 2)_7 = A_6
642082Seschrock  *	(A * 2)_6 = A_5
652082Seschrock  *	(A * 2)_5 = A_4
662082Seschrock  *	(A * 2)_4 = A_3 + A_7
672082Seschrock  *	(A * 2)_3 = A_2 + A_7
682082Seschrock  *	(A * 2)_2 = A_1 + A_7
692082Seschrock  *	(A * 2)_1 = A_0
702082Seschrock  *	(A * 2)_0 = A_7
712082Seschrock  *
722082Seschrock  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
73*10105Sadam.leventhal@sun.com  * As an aside, this multiplication is derived from the error correcting
74*10105Sadam.leventhal@sun.com  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
752082Seschrock  *
762082Seschrock  * Observe that any number in the field (except for 0) can be expressed as a
772082Seschrock  * power of 2 -- a generator for the field. We store a table of the powers of
782082Seschrock  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
792082Seschrock  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
80*10105Sadam.leventhal@sun.com  * than field addition). The inverse of a field element A (A^-1) is therefore
81*10105Sadam.leventhal@sun.com  * A ^ (255 - 1) = A^254.
822082Seschrock  *
83*10105Sadam.leventhal@sun.com  * The up-to-three parity columns, P, Q, R over several data columns,
84*10105Sadam.leventhal@sun.com  * D_0, ... D_n-1, can be expressed by field operations:
852082Seschrock  *
862082Seschrock  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
872082Seschrock  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
882082Seschrock  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
89*10105Sadam.leventhal@sun.com  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
90*10105Sadam.leventhal@sun.com  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
912082Seschrock  *
92*10105Sadam.leventhal@sun.com  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
93*10105Sadam.leventhal@sun.com  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
94*10105Sadam.leventhal@sun.com  * independent coefficients. (There are no additional coefficients that have
95*10105Sadam.leventhal@sun.com  * this property which is why the uncorrected Plank method breaks down.)
96*10105Sadam.leventhal@sun.com  *
97*10105Sadam.leventhal@sun.com  * See the reconstruction code below for how P, Q and R can used individually
98*10105Sadam.leventhal@sun.com  * or in concert to recover missing data columns.
99789Sahrens  */
100789Sahrens 
101789Sahrens typedef struct raidz_col {
1022082Seschrock 	uint64_t rc_devidx;		/* child device index for I/O */
1032082Seschrock 	uint64_t rc_offset;		/* device offset */
1042082Seschrock 	uint64_t rc_size;		/* I/O size */
1052082Seschrock 	void *rc_data;			/* I/O data */
1062082Seschrock 	int rc_error;			/* I/O error for this device */
1072082Seschrock 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
1082082Seschrock 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
109789Sahrens } raidz_col_t;
110789Sahrens 
111789Sahrens typedef struct raidz_map {
112*10105Sadam.leventhal@sun.com 	uint64_t rm_cols;		/* Regular column count */
113*10105Sadam.leventhal@sun.com 	uint64_t rm_scols;		/* Count including skipped columns */
1142082Seschrock 	uint64_t rm_bigcols;		/* Number of oversized columns */
1152082Seschrock 	uint64_t rm_asize;		/* Actual total I/O size */
1162082Seschrock 	uint64_t rm_missingdata;	/* Count of missing data devices */
1172082Seschrock 	uint64_t rm_missingparity;	/* Count of missing parity devices */
1182082Seschrock 	uint64_t rm_firstdatacol;	/* First data column/parity count */
119*10105Sadam.leventhal@sun.com 	uint64_t rm_skipped;		/* Skipped sectors for padding */
1202082Seschrock 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
121789Sahrens } raidz_map_t;
122789Sahrens 
1232082Seschrock #define	VDEV_RAIDZ_P		0
1242082Seschrock #define	VDEV_RAIDZ_Q		1
125*10105Sadam.leventhal@sun.com #define	VDEV_RAIDZ_R		2
126*10105Sadam.leventhal@sun.com #define	VDEV_RAIDZ_MAXPARITY	3
1272082Seschrock 
128*10105Sadam.leventhal@sun.com #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
129*10105Sadam.leventhal@sun.com #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
1302082Seschrock 
131*10105Sadam.leventhal@sun.com /*
132*10105Sadam.leventhal@sun.com  * We provide a mechanism to perform the field multiplication operation on a
133*10105Sadam.leventhal@sun.com  * 64-bit value all at once rather than a byte at a time. This works by
134*10105Sadam.leventhal@sun.com  * creating a mask from the top bit in each byte and using that to
135*10105Sadam.leventhal@sun.com  * conditionally apply the XOR of 0x1d.
136*10105Sadam.leventhal@sun.com  */
137*10105Sadam.leventhal@sun.com #define	VDEV_RAIDZ_64MUL_2(x, mask) \
138*10105Sadam.leventhal@sun.com { \
139*10105Sadam.leventhal@sun.com 	(mask) = (x) & 0x8080808080808080ULL; \
140*10105Sadam.leventhal@sun.com 	(mask) = ((mask) << 1) - ((mask) >> 7); \
141*10105Sadam.leventhal@sun.com 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
142*10105Sadam.leventhal@sun.com 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
143*10105Sadam.leventhal@sun.com }
144*10105Sadam.leventhal@sun.com 
145*10105Sadam.leventhal@sun.com #define	VDEV_RAIDZ_64MUL_4(x, mask) \
146*10105Sadam.leventhal@sun.com { \
147*10105Sadam.leventhal@sun.com 	VDEV_RAIDZ_64MUL_2((x), mask); \
148*10105Sadam.leventhal@sun.com 	VDEV_RAIDZ_64MUL_2((x), mask); \
149*10105Sadam.leventhal@sun.com }
150*10105Sadam.leventhal@sun.com 
151*10105Sadam.leventhal@sun.com /*
152*10105Sadam.leventhal@sun.com  * Force reconstruction to use the general purpose method.
153*10105Sadam.leventhal@sun.com  */
154*10105Sadam.leventhal@sun.com int vdev_raidz_default_to_general;
1552082Seschrock 
1562082Seschrock /*
1572082Seschrock  * These two tables represent powers and logs of 2 in the Galois field defined
1582082Seschrock  * above. These values were computed by repeatedly multiplying by 2 as above.
1592082Seschrock  */
1602082Seschrock static const uint8_t vdev_raidz_pow2[256] = {
1612082Seschrock 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
1622082Seschrock 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
1632082Seschrock 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
1642082Seschrock 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
1652082Seschrock 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
1662082Seschrock 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
1672082Seschrock 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
1682082Seschrock 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
1692082Seschrock 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
1702082Seschrock 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
1712082Seschrock 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
1722082Seschrock 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
1732082Seschrock 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
1742082Seschrock 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
1752082Seschrock 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
1762082Seschrock 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
1772082Seschrock 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
1782082Seschrock 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
1792082Seschrock 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
1802082Seschrock 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
1812082Seschrock 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
1822082Seschrock 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
1832082Seschrock 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
1842082Seschrock 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
1852082Seschrock 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
1862082Seschrock 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
1872082Seschrock 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
1882082Seschrock 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
1892082Seschrock 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
1902082Seschrock 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
1912082Seschrock 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
1922082Seschrock 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
1932082Seschrock };
1942082Seschrock static const uint8_t vdev_raidz_log2[256] = {
1952082Seschrock 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
1962082Seschrock 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
1972082Seschrock 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
1982082Seschrock 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
1992082Seschrock 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
2002082Seschrock 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
2012082Seschrock 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
2022082Seschrock 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
2032082Seschrock 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
2042082Seschrock 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
2052082Seschrock 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
2062082Seschrock 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
2072082Seschrock 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
2082082Seschrock 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
2092082Seschrock 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
2102082Seschrock 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
2112082Seschrock 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
2122082Seschrock 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
2132082Seschrock 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
2142082Seschrock 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
2152082Seschrock 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
2162082Seschrock 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
2172082Seschrock 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
2182082Seschrock 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
2192082Seschrock 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
2202082Seschrock 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
2212082Seschrock 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
2222082Seschrock 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
2232082Seschrock 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
2242082Seschrock 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
2252082Seschrock 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
2262082Seschrock 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
2272082Seschrock };
2282082Seschrock 
2292082Seschrock /*
2302082Seschrock  * Multiply a given number by 2 raised to the given power.
2312082Seschrock  */
2322082Seschrock static uint8_t
2332082Seschrock vdev_raidz_exp2(uint_t a, int exp)
2342082Seschrock {
2352082Seschrock 	if (a == 0)
2362082Seschrock 		return (0);
2372082Seschrock 
2382082Seschrock 	ASSERT(exp >= 0);
2392082Seschrock 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
2402082Seschrock 
2412082Seschrock 	exp += vdev_raidz_log2[a];
2422082Seschrock 	if (exp > 255)
2432082Seschrock 		exp -= 255;
2442082Seschrock 
2452082Seschrock 	return (vdev_raidz_pow2[exp]);
2462082Seschrock }
2472082Seschrock 
2487754SJeff.Bonwick@Sun.COM static void
2497754SJeff.Bonwick@Sun.COM vdev_raidz_map_free(zio_t *zio)
2507754SJeff.Bonwick@Sun.COM {
2517754SJeff.Bonwick@Sun.COM 	raidz_map_t *rm = zio->io_vsd;
2527754SJeff.Bonwick@Sun.COM 	int c;
2537754SJeff.Bonwick@Sun.COM 
2547754SJeff.Bonwick@Sun.COM 	for (c = 0; c < rm->rm_firstdatacol; c++)
2557754SJeff.Bonwick@Sun.COM 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
2567754SJeff.Bonwick@Sun.COM 
257*10105Sadam.leventhal@sun.com 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
2587754SJeff.Bonwick@Sun.COM }
2597754SJeff.Bonwick@Sun.COM 
260789Sahrens static raidz_map_t *
2612082Seschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
2622082Seschrock     uint64_t nparity)
263789Sahrens {
264789Sahrens 	raidz_map_t *rm;
265789Sahrens 	uint64_t b = zio->io_offset >> unit_shift;
266789Sahrens 	uint64_t s = zio->io_size >> unit_shift;
267789Sahrens 	uint64_t f = b % dcols;
268789Sahrens 	uint64_t o = (b / dcols) << unit_shift;
269*10105Sadam.leventhal@sun.com 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
270789Sahrens 
2712082Seschrock 	q = s / (dcols - nparity);
2722082Seschrock 	r = s - q * (dcols - nparity);
2732082Seschrock 	bc = (r == 0 ? 0 : r + nparity);
274*10105Sadam.leventhal@sun.com 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
275789Sahrens 
276*10105Sadam.leventhal@sun.com 	if (q == 0) {
277*10105Sadam.leventhal@sun.com 		acols = bc;
278*10105Sadam.leventhal@sun.com 		scols = MIN(dcols, roundup(bc, nparity + 1));
279*10105Sadam.leventhal@sun.com 	} else {
280*10105Sadam.leventhal@sun.com 		acols = dcols;
281*10105Sadam.leventhal@sun.com 		scols = dcols;
282*10105Sadam.leventhal@sun.com 	}
283789Sahrens 
284*10105Sadam.leventhal@sun.com 	ASSERT3U(acols, <=, scols);
285*10105Sadam.leventhal@sun.com 
286*10105Sadam.leventhal@sun.com 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
287789Sahrens 
288789Sahrens 	rm->rm_cols = acols;
289*10105Sadam.leventhal@sun.com 	rm->rm_scols = scols;
290789Sahrens 	rm->rm_bigcols = bc;
2912082Seschrock 	rm->rm_missingdata = 0;
2922082Seschrock 	rm->rm_missingparity = 0;
2932082Seschrock 	rm->rm_firstdatacol = nparity;
294789Sahrens 
295*10105Sadam.leventhal@sun.com 	asize = 0;
296*10105Sadam.leventhal@sun.com 
297*10105Sadam.leventhal@sun.com 	for (c = 0; c < scols; c++) {
298789Sahrens 		col = f + c;
299789Sahrens 		coff = o;
300789Sahrens 		if (col >= dcols) {
301789Sahrens 			col -= dcols;
302789Sahrens 			coff += 1ULL << unit_shift;
303789Sahrens 		}
3042082Seschrock 		rm->rm_col[c].rc_devidx = col;
305789Sahrens 		rm->rm_col[c].rc_offset = coff;
306789Sahrens 		rm->rm_col[c].rc_data = NULL;
307789Sahrens 		rm->rm_col[c].rc_error = 0;
308789Sahrens 		rm->rm_col[c].rc_tried = 0;
309789Sahrens 		rm->rm_col[c].rc_skipped = 0;
310*10105Sadam.leventhal@sun.com 
311*10105Sadam.leventhal@sun.com 		if (c >= acols)
312*10105Sadam.leventhal@sun.com 			rm->rm_col[c].rc_size = 0;
313*10105Sadam.leventhal@sun.com 		else if (c < bc)
314*10105Sadam.leventhal@sun.com 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
315*10105Sadam.leventhal@sun.com 		else
316*10105Sadam.leventhal@sun.com 			rm->rm_col[c].rc_size = q << unit_shift;
317*10105Sadam.leventhal@sun.com 
318*10105Sadam.leventhal@sun.com 		asize += rm->rm_col[c].rc_size;
319789Sahrens 	}
320789Sahrens 
321*10105Sadam.leventhal@sun.com 	ASSERT3U(asize, ==, tot << unit_shift);
322*10105Sadam.leventhal@sun.com 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
323*10105Sadam.leventhal@sun.com 	rm->rm_skipped = roundup(tot, nparity + 1) - tot;
324*10105Sadam.leventhal@sun.com 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_skipped << unit_shift);
325*10105Sadam.leventhal@sun.com 	ASSERT3U(rm->rm_skipped, <=, nparity);
326789Sahrens 
327789Sahrens 	for (c = 0; c < rm->rm_firstdatacol; c++)
328789Sahrens 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
329789Sahrens 
330789Sahrens 	rm->rm_col[c].rc_data = zio->io_data;
331789Sahrens 
332789Sahrens 	for (c = c + 1; c < acols; c++)
333789Sahrens 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
334789Sahrens 		    rm->rm_col[c - 1].rc_size;
335789Sahrens 
3361133Seschrock 	/*
3372082Seschrock 	 * If all data stored spans all columns, there's a danger that parity
3382082Seschrock 	 * will always be on the same device and, since parity isn't read
3392082Seschrock 	 * during normal operation, that that device's I/O bandwidth won't be
3402082Seschrock 	 * used effectively. We therefore switch the parity every 1MB.
3412082Seschrock 	 *
3422082Seschrock 	 * ... at least that was, ostensibly, the theory. As a practical
3432082Seschrock 	 * matter unless we juggle the parity between all devices evenly, we
3442082Seschrock 	 * won't see any benefit. Further, occasional writes that aren't a
3452082Seschrock 	 * multiple of the LCM of the number of children and the minimum
3462082Seschrock 	 * stripe width are sufficient to avoid pessimal behavior.
3472082Seschrock 	 * Unfortunately, this decision created an implicit on-disk format
3483456Sahl 	 * requirement that we need to support for all eternity, but only
3493456Sahl 	 * for single-parity RAID-Z.
3501133Seschrock 	 */
3511133Seschrock 	ASSERT(rm->rm_cols >= 2);
3521133Seschrock 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
353789Sahrens 
3542082Seschrock 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
3552082Seschrock 		devidx = rm->rm_col[0].rc_devidx;
3561133Seschrock 		o = rm->rm_col[0].rc_offset;
3572082Seschrock 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
3581133Seschrock 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
3592082Seschrock 		rm->rm_col[1].rc_devidx = devidx;
3601133Seschrock 		rm->rm_col[1].rc_offset = o;
361789Sahrens 	}
362789Sahrens 
363789Sahrens 	zio->io_vsd = rm;
3647754SJeff.Bonwick@Sun.COM 	zio->io_vsd_free = vdev_raidz_map_free;
365789Sahrens 	return (rm);
366789Sahrens }
367789Sahrens 
368789Sahrens static void
3692082Seschrock vdev_raidz_generate_parity_p(raidz_map_t *rm)
3702082Seschrock {
3712082Seschrock 	uint64_t *p, *src, pcount, ccount, i;
3722082Seschrock 	int c;
3732082Seschrock 
3742082Seschrock 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
3752082Seschrock 
3762082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
3772082Seschrock 		src = rm->rm_col[c].rc_data;
3782082Seschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
3792082Seschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
3802082Seschrock 
3812082Seschrock 		if (c == rm->rm_firstdatacol) {
3822082Seschrock 			ASSERT(ccount == pcount);
383*10105Sadam.leventhal@sun.com 			for (i = 0; i < ccount; i++, src++, p++) {
3842082Seschrock 				*p = *src;
3852082Seschrock 			}
3862082Seschrock 		} else {
3872082Seschrock 			ASSERT(ccount <= pcount);
388*10105Sadam.leventhal@sun.com 			for (i = 0; i < ccount; i++, src++, p++) {
3892082Seschrock 				*p ^= *src;
3902082Seschrock 			}
3912082Seschrock 		}
3922082Seschrock 	}
3932082Seschrock }
3942082Seschrock 
3952082Seschrock static void
3962082Seschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm)
397789Sahrens {
398*10105Sadam.leventhal@sun.com 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
3992082Seschrock 	int c;
4002082Seschrock 
401*10105Sadam.leventhal@sun.com 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
4022082Seschrock 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
4032082Seschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
4042082Seschrock 
4052082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
4062082Seschrock 		src = rm->rm_col[c].rc_data;
4072082Seschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
4082082Seschrock 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
409*10105Sadam.leventhal@sun.com 
410*10105Sadam.leventhal@sun.com 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
4112082Seschrock 
4122082Seschrock 		if (c == rm->rm_firstdatacol) {
413*10105Sadam.leventhal@sun.com 			ASSERT(ccnt == pcnt || ccnt == 0);
414*10105Sadam.leventhal@sun.com 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
4152082Seschrock 				*p = *src;
416*10105Sadam.leventhal@sun.com 				*q = *src;
4172082Seschrock 			}
418*10105Sadam.leventhal@sun.com 			for (; i < pcnt; i++, src++, p++, q++) {
419*10105Sadam.leventhal@sun.com 				*p = 0;
4202082Seschrock 				*q = 0;
4212082Seschrock 			}
4222082Seschrock 		} else {
423*10105Sadam.leventhal@sun.com 			ASSERT(ccnt <= pcnt);
424789Sahrens 
4252082Seschrock 			/*
426*10105Sadam.leventhal@sun.com 			 * Apply the algorithm described above by multiplying
427*10105Sadam.leventhal@sun.com 			 * the previous result and adding in the new value.
4282082Seschrock 			 */
429*10105Sadam.leventhal@sun.com 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
430*10105Sadam.leventhal@sun.com 				*p ^= *src;
431*10105Sadam.leventhal@sun.com 
432*10105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*q, mask);
4332082Seschrock 				*q ^= *src;
4342082Seschrock 			}
4352082Seschrock 
4362082Seschrock 			/*
4372082Seschrock 			 * Treat short columns as though they are full of 0s.
438*10105Sadam.leventhal@sun.com 			 * Note that there's therefore nothing needed for P.
4392082Seschrock 			 */
440*10105Sadam.leventhal@sun.com 			for (; i < pcnt; i++, q++) {
441*10105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*q, mask);
4422082Seschrock 			}
4432082Seschrock 		}
4442082Seschrock 	}
4452082Seschrock }
4462082Seschrock 
4472082Seschrock static void
448*10105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
449*10105Sadam.leventhal@sun.com {
450*10105Sadam.leventhal@sun.com 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
451*10105Sadam.leventhal@sun.com 	int c;
452*10105Sadam.leventhal@sun.com 
453*10105Sadam.leventhal@sun.com 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
454*10105Sadam.leventhal@sun.com 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
455*10105Sadam.leventhal@sun.com 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
456*10105Sadam.leventhal@sun.com 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
457*10105Sadam.leventhal@sun.com 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
458*10105Sadam.leventhal@sun.com 
459*10105Sadam.leventhal@sun.com 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
460*10105Sadam.leventhal@sun.com 		src = rm->rm_col[c].rc_data;
461*10105Sadam.leventhal@sun.com 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
462*10105Sadam.leventhal@sun.com 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
463*10105Sadam.leventhal@sun.com 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
464*10105Sadam.leventhal@sun.com 
465*10105Sadam.leventhal@sun.com 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
466*10105Sadam.leventhal@sun.com 
467*10105Sadam.leventhal@sun.com 		if (c == rm->rm_firstdatacol) {
468*10105Sadam.leventhal@sun.com 			ASSERT(ccnt == pcnt || ccnt == 0);
469*10105Sadam.leventhal@sun.com 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
470*10105Sadam.leventhal@sun.com 				*p = *src;
471*10105Sadam.leventhal@sun.com 				*q = *src;
472*10105Sadam.leventhal@sun.com 				*r = *src;
473*10105Sadam.leventhal@sun.com 			}
474*10105Sadam.leventhal@sun.com 			for (; i < pcnt; i++, src++, p++, q++, r++) {
475*10105Sadam.leventhal@sun.com 				*p = 0;
476*10105Sadam.leventhal@sun.com 				*q = 0;
477*10105Sadam.leventhal@sun.com 				*r = 0;
478*10105Sadam.leventhal@sun.com 			}
479*10105Sadam.leventhal@sun.com 		} else {
480*10105Sadam.leventhal@sun.com 			ASSERT(ccnt <= pcnt);
481*10105Sadam.leventhal@sun.com 
482*10105Sadam.leventhal@sun.com 			/*
483*10105Sadam.leventhal@sun.com 			 * Apply the algorithm described above by multiplying
484*10105Sadam.leventhal@sun.com 			 * the previous result and adding in the new value.
485*10105Sadam.leventhal@sun.com 			 */
486*10105Sadam.leventhal@sun.com 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
487*10105Sadam.leventhal@sun.com 				*p ^= *src;
488*10105Sadam.leventhal@sun.com 
489*10105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*q, mask);
490*10105Sadam.leventhal@sun.com 				*q ^= *src;
491*10105Sadam.leventhal@sun.com 
492*10105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_4(*r, mask);
493*10105Sadam.leventhal@sun.com 				*r ^= *src;
494*10105Sadam.leventhal@sun.com 			}
495*10105Sadam.leventhal@sun.com 
496*10105Sadam.leventhal@sun.com 			/*
497*10105Sadam.leventhal@sun.com 			 * Treat short columns as though they are full of 0s.
498*10105Sadam.leventhal@sun.com 			 * Note that there's therefore nothing needed for P.
499*10105Sadam.leventhal@sun.com 			 */
500*10105Sadam.leventhal@sun.com 			for (; i < pcnt; i++, q++, r++) {
501*10105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*q, mask);
502*10105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_4(*r, mask);
503*10105Sadam.leventhal@sun.com 			}
504*10105Sadam.leventhal@sun.com 		}
505*10105Sadam.leventhal@sun.com 	}
506*10105Sadam.leventhal@sun.com }
507*10105Sadam.leventhal@sun.com 
508*10105Sadam.leventhal@sun.com /*
509*10105Sadam.leventhal@sun.com  * Generate RAID parity in the first virtual columns according to the number of
510*10105Sadam.leventhal@sun.com  * parity columns available.
511*10105Sadam.leventhal@sun.com  */
512*10105Sadam.leventhal@sun.com static void
513*10105Sadam.leventhal@sun.com vdev_raidz_generate_parity(raidz_map_t *rm)
514*10105Sadam.leventhal@sun.com {
515*10105Sadam.leventhal@sun.com 	switch (rm->rm_firstdatacol) {
516*10105Sadam.leventhal@sun.com 	case 1:
517*10105Sadam.leventhal@sun.com 		vdev_raidz_generate_parity_p(rm);
518*10105Sadam.leventhal@sun.com 		break;
519*10105Sadam.leventhal@sun.com 	case 2:
520*10105Sadam.leventhal@sun.com 		vdev_raidz_generate_parity_pq(rm);
521*10105Sadam.leventhal@sun.com 		break;
522*10105Sadam.leventhal@sun.com 	case 3:
523*10105Sadam.leventhal@sun.com 		vdev_raidz_generate_parity_pqr(rm);
524*10105Sadam.leventhal@sun.com 		break;
525*10105Sadam.leventhal@sun.com 	default:
526*10105Sadam.leventhal@sun.com 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
527*10105Sadam.leventhal@sun.com 	}
528*10105Sadam.leventhal@sun.com }
529*10105Sadam.leventhal@sun.com 
530*10105Sadam.leventhal@sun.com static int
531*10105Sadam.leventhal@sun.com vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
5322082Seschrock {
5332082Seschrock 	uint64_t *dst, *src, xcount, ccount, count, i;
534*10105Sadam.leventhal@sun.com 	int x = tgts[0];
5352082Seschrock 	int c;
5362082Seschrock 
537*10105Sadam.leventhal@sun.com 	ASSERT(ntgts == 1);
538*10105Sadam.leventhal@sun.com 	ASSERT(x >= rm->rm_firstdatacol);
539*10105Sadam.leventhal@sun.com 	ASSERT(x < rm->rm_cols);
540*10105Sadam.leventhal@sun.com 
5412082Seschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
5422082Seschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
5432082Seschrock 	ASSERT(xcount > 0);
5442082Seschrock 
5452082Seschrock 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
5462082Seschrock 	dst = rm->rm_col[x].rc_data;
5472082Seschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
5482082Seschrock 		*dst = *src;
5492082Seschrock 	}
5502082Seschrock 
5512082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
552789Sahrens 		src = rm->rm_col[c].rc_data;
553789Sahrens 		dst = rm->rm_col[x].rc_data;
5542082Seschrock 
5552082Seschrock 		if (c == x)
5562082Seschrock 			continue;
5572082Seschrock 
5582082Seschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
5592082Seschrock 		count = MIN(ccount, xcount);
5602082Seschrock 
5612082Seschrock 		for (i = 0; i < count; i++, dst++, src++) {
5622082Seschrock 			*dst ^= *src;
563789Sahrens 		}
564789Sahrens 	}
565*10105Sadam.leventhal@sun.com 
566*10105Sadam.leventhal@sun.com 	return (1 << VDEV_RAIDZ_P);
567789Sahrens }
568789Sahrens 
569*10105Sadam.leventhal@sun.com static int
570*10105Sadam.leventhal@sun.com vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
5712082Seschrock {
5722082Seschrock 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
5732082Seschrock 	uint8_t *b;
574*10105Sadam.leventhal@sun.com 	int x = tgts[0];
5752082Seschrock 	int c, j, exp;
5762082Seschrock 
577*10105Sadam.leventhal@sun.com 	ASSERT(ntgts == 1);
578*10105Sadam.leventhal@sun.com 
5792082Seschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
5802082Seschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
5812082Seschrock 
5822082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
5832082Seschrock 		src = rm->rm_col[c].rc_data;
5842082Seschrock 		dst = rm->rm_col[x].rc_data;
5852082Seschrock 
5862082Seschrock 		if (c == x)
5872082Seschrock 			ccount = 0;
5882082Seschrock 		else
5892082Seschrock 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
5902082Seschrock 
5912082Seschrock 		count = MIN(ccount, xcount);
5922082Seschrock 
5932082Seschrock 		if (c == rm->rm_firstdatacol) {
5942082Seschrock 			for (i = 0; i < count; i++, dst++, src++) {
5952082Seschrock 				*dst = *src;
5962082Seschrock 			}
5972082Seschrock 			for (; i < xcount; i++, dst++) {
5982082Seschrock 				*dst = 0;
5992082Seschrock 			}
6002082Seschrock 
6012082Seschrock 		} else {
6022082Seschrock 			for (i = 0; i < count; i++, dst++, src++) {
603*10105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*dst, mask);
6042082Seschrock 				*dst ^= *src;
6052082Seschrock 			}
6062082Seschrock 
6072082Seschrock 			for (; i < xcount; i++, dst++) {
608*10105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*dst, mask);
6092082Seschrock 			}
6102082Seschrock 		}
6112082Seschrock 	}
6122082Seschrock 
6132082Seschrock 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
6142082Seschrock 	dst = rm->rm_col[x].rc_data;
6152082Seschrock 	exp = 255 - (rm->rm_cols - 1 - x);
6162082Seschrock 
6172082Seschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
6182082Seschrock 		*dst ^= *src;
6192082Seschrock 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
6202082Seschrock 			*b = vdev_raidz_exp2(*b, exp);
6212082Seschrock 		}
6222082Seschrock 	}
623*10105Sadam.leventhal@sun.com 
624*10105Sadam.leventhal@sun.com 	return (1 << VDEV_RAIDZ_Q);
6252082Seschrock }
6262082Seschrock 
627*10105Sadam.leventhal@sun.com static int
628*10105Sadam.leventhal@sun.com vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
6292082Seschrock {
6302082Seschrock 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
6312082Seschrock 	void *pdata, *qdata;
6322082Seschrock 	uint64_t xsize, ysize, i;
633*10105Sadam.leventhal@sun.com 	int x = tgts[0];
634*10105Sadam.leventhal@sun.com 	int y = tgts[1];
6352082Seschrock 
636*10105Sadam.leventhal@sun.com 	ASSERT(ntgts == 2);
6372082Seschrock 	ASSERT(x < y);
6382082Seschrock 	ASSERT(x >= rm->rm_firstdatacol);
6392082Seschrock 	ASSERT(y < rm->rm_cols);
6402082Seschrock 
6412082Seschrock 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
6422082Seschrock 
6432082Seschrock 	/*
6442082Seschrock 	 * Move the parity data aside -- we're going to compute parity as
6452082Seschrock 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
6462082Seschrock 	 * reuse the parity generation mechanism without trashing the actual
6472082Seschrock 	 * parity so we make those columns appear to be full of zeros by
6482082Seschrock 	 * setting their lengths to zero.
6492082Seschrock 	 */
6502082Seschrock 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
6512082Seschrock 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
6522082Seschrock 	xsize = rm->rm_col[x].rc_size;
6532082Seschrock 	ysize = rm->rm_col[y].rc_size;
6542082Seschrock 
6552082Seschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
6562082Seschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
6572082Seschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
6582082Seschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
6592082Seschrock 	rm->rm_col[x].rc_size = 0;
6602082Seschrock 	rm->rm_col[y].rc_size = 0;
6612082Seschrock 
6622082Seschrock 	vdev_raidz_generate_parity_pq(rm);
6632082Seschrock 
6642082Seschrock 	rm->rm_col[x].rc_size = xsize;
6652082Seschrock 	rm->rm_col[y].rc_size = ysize;
6662082Seschrock 
6672082Seschrock 	p = pdata;
6682082Seschrock 	q = qdata;
6692082Seschrock 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
6702082Seschrock 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
6712082Seschrock 	xd = rm->rm_col[x].rc_data;
6722082Seschrock 	yd = rm->rm_col[y].rc_data;
6732082Seschrock 
6742082Seschrock 	/*
6752082Seschrock 	 * We now have:
6762082Seschrock 	 *	Pxy = P + D_x + D_y
6772082Seschrock 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
6782082Seschrock 	 *
6792082Seschrock 	 * We can then solve for D_x:
6802082Seschrock 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
6812082Seschrock 	 * where
6822082Seschrock 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
6832082Seschrock 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
6842082Seschrock 	 *
6852082Seschrock 	 * With D_x in hand, we can easily solve for D_y:
6862082Seschrock 	 *	D_y = P + Pxy + D_x
6872082Seschrock 	 */
6882082Seschrock 
6892082Seschrock 	a = vdev_raidz_pow2[255 + x - y];
6902082Seschrock 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
6912082Seschrock 	tmp = 255 - vdev_raidz_log2[a ^ 1];
6922082Seschrock 
6932082Seschrock 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
6942082Seschrock 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
6952082Seschrock 
6962082Seschrock 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
6972082Seschrock 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
6982082Seschrock 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
6992082Seschrock 
7002082Seschrock 		if (i < ysize)
7012082Seschrock 			*yd = *p ^ *pxy ^ *xd;
7022082Seschrock 	}
7032082Seschrock 
7042082Seschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
7052082Seschrock 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
7062082Seschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
7072082Seschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
7082082Seschrock 
7092082Seschrock 	/*
7102082Seschrock 	 * Restore the saved parity data.
7112082Seschrock 	 */
7122082Seschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
7132082Seschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
714*10105Sadam.leventhal@sun.com 
715*10105Sadam.leventhal@sun.com 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
716*10105Sadam.leventhal@sun.com }
717*10105Sadam.leventhal@sun.com 
718*10105Sadam.leventhal@sun.com /* BEGIN CSTYLED */
719*10105Sadam.leventhal@sun.com /*
720*10105Sadam.leventhal@sun.com  * In the general case of reconstruction, we must solve the system of linear
721*10105Sadam.leventhal@sun.com  * equations defined by the coeffecients used to generate parity as well as
722*10105Sadam.leventhal@sun.com  * the contents of the data and parity disks. This can be expressed with
723*10105Sadam.leventhal@sun.com  * vectors for the original data (D) and the actual data (d) and parity (p)
724*10105Sadam.leventhal@sun.com  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
725*10105Sadam.leventhal@sun.com  *
726*10105Sadam.leventhal@sun.com  *            __   __                     __     __
727*10105Sadam.leventhal@sun.com  *            |     |         __     __   |  p_0  |
728*10105Sadam.leventhal@sun.com  *            |  V  |         |  D_0  |   | p_m-1 |
729*10105Sadam.leventhal@sun.com  *            |     |    x    |   :   | = |  d_0  |
730*10105Sadam.leventhal@sun.com  *            |  I  |         | D_n-1 |   |   :   |
731*10105Sadam.leventhal@sun.com  *            |     |         ~~     ~~   | d_n-1 |
732*10105Sadam.leventhal@sun.com  *            ~~   ~~                     ~~     ~~
733*10105Sadam.leventhal@sun.com  *
734*10105Sadam.leventhal@sun.com  * I is simply a square identity matrix of size n, and V is a vandermonde
735*10105Sadam.leventhal@sun.com  * matrix defined by the coeffecients we chose for the various parity columns
736*10105Sadam.leventhal@sun.com  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
737*10105Sadam.leventhal@sun.com  * computation as well as linear separability.
738*10105Sadam.leventhal@sun.com  *
739*10105Sadam.leventhal@sun.com  *      __               __               __     __
740*10105Sadam.leventhal@sun.com  *      |   1   ..  1 1 1 |               |  p_0  |
741*10105Sadam.leventhal@sun.com  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
742*10105Sadam.leventhal@sun.com  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
743*10105Sadam.leventhal@sun.com  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
744*10105Sadam.leventhal@sun.com  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
745*10105Sadam.leventhal@sun.com  *      |   :       : : : |   |   :   |   |  d_2  |
746*10105Sadam.leventhal@sun.com  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
747*10105Sadam.leventhal@sun.com  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
748*10105Sadam.leventhal@sun.com  *      |   0   ..  0 0 1 |               | d_n-1 |
749*10105Sadam.leventhal@sun.com  *      ~~               ~~               ~~     ~~
750*10105Sadam.leventhal@sun.com  *
751*10105Sadam.leventhal@sun.com  * Note that I, V, d, and p are known. To compute D, we must invert the
752*10105Sadam.leventhal@sun.com  * matrix and use the known data and parity values to reconstruct the unknown
753*10105Sadam.leventhal@sun.com  * data values. We begin by removing the rows in V|I and d|p that correspond
754*10105Sadam.leventhal@sun.com  * to failed or missing columns; we then make V|I square (n x n) and d|p
755*10105Sadam.leventhal@sun.com  * sized n by removing rows corresponding to unused parity from the bottom up
756*10105Sadam.leventhal@sun.com  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
757*10105Sadam.leventhal@sun.com  * using Gauss-Jordan elimination. In the example below we use m=3 parity
758*10105Sadam.leventhal@sun.com  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
759*10105Sadam.leventhal@sun.com  *           __                               __
760*10105Sadam.leventhal@sun.com  *           |  1   1   1   1   1   1   1   1  |
761*10105Sadam.leventhal@sun.com  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
762*10105Sadam.leventhal@sun.com  *           |  19 205 116  29  64  16  4   1  |      / /
763*10105Sadam.leventhal@sun.com  *           |  1   0   0   0   0   0   0   0  |     / /
764*10105Sadam.leventhal@sun.com  *           |  0   1   0   0   0   0   0   0  | <--' /
765*10105Sadam.leventhal@sun.com  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
766*10105Sadam.leventhal@sun.com  *           |  0   0   0   1   0   0   0   0  |
767*10105Sadam.leventhal@sun.com  *           |  0   0   0   0   1   0   0   0  |
768*10105Sadam.leventhal@sun.com  *           |  0   0   0   0   0   1   0   0  |
769*10105Sadam.leventhal@sun.com  *           |  0   0   0   0   0   0   1   0  |
770*10105Sadam.leventhal@sun.com  *           |  0   0   0   0   0   0   0   1  |
771*10105Sadam.leventhal@sun.com  *           ~~                               ~~
772*10105Sadam.leventhal@sun.com  *           __                               __
773*10105Sadam.leventhal@sun.com  *           |  1   1   1   1   1   1   1   1  |
774*10105Sadam.leventhal@sun.com  *           | 128  64  32  16  8   4   2   1  |
775*10105Sadam.leventhal@sun.com  *           |  19 205 116  29  64  16  4   1  |
776*10105Sadam.leventhal@sun.com  *           |  1   0   0   0   0   0   0   0  |
777*10105Sadam.leventhal@sun.com  *           |  0   1   0   0   0   0   0   0  |
778*10105Sadam.leventhal@sun.com  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
779*10105Sadam.leventhal@sun.com  *           |  0   0   0   1   0   0   0   0  |
780*10105Sadam.leventhal@sun.com  *           |  0   0   0   0   1   0   0   0  |
781*10105Sadam.leventhal@sun.com  *           |  0   0   0   0   0   1   0   0  |
782*10105Sadam.leventhal@sun.com  *           |  0   0   0   0   0   0   1   0  |
783*10105Sadam.leventhal@sun.com  *           |  0   0   0   0   0   0   0   1  |
784*10105Sadam.leventhal@sun.com  *           ~~                               ~~
785*10105Sadam.leventhal@sun.com  *
786*10105Sadam.leventhal@sun.com  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
787*10105Sadam.leventhal@sun.com  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
788*10105Sadam.leventhal@sun.com  * matrix is not singular.
789*10105Sadam.leventhal@sun.com  * __                                                                 __
790*10105Sadam.leventhal@sun.com  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
791*10105Sadam.leventhal@sun.com  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
792*10105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
793*10105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
794*10105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
795*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
796*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
797*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
798*10105Sadam.leventhal@sun.com  * ~~                                                                 ~~
799*10105Sadam.leventhal@sun.com  * __                                                                 __
800*10105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
801*10105Sadam.leventhal@sun.com  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
802*10105Sadam.leventhal@sun.com  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
803*10105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
804*10105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
805*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
806*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
807*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
808*10105Sadam.leventhal@sun.com  * ~~                                                                 ~~
809*10105Sadam.leventhal@sun.com  * __                                                                 __
810*10105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
811*10105Sadam.leventhal@sun.com  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
812*10105Sadam.leventhal@sun.com  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
813*10105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
814*10105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
815*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
816*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
817*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
818*10105Sadam.leventhal@sun.com  * ~~                                                                 ~~
819*10105Sadam.leventhal@sun.com  * __                                                                 __
820*10105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
821*10105Sadam.leventhal@sun.com  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
822*10105Sadam.leventhal@sun.com  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
823*10105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
824*10105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
825*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
826*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
827*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
828*10105Sadam.leventhal@sun.com  * ~~                                                                 ~~
829*10105Sadam.leventhal@sun.com  * __                                                                 __
830*10105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
831*10105Sadam.leventhal@sun.com  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
832*10105Sadam.leventhal@sun.com  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
833*10105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
834*10105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
835*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
836*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
837*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
838*10105Sadam.leventhal@sun.com  * ~~                                                                 ~~
839*10105Sadam.leventhal@sun.com  * __                                                                 __
840*10105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
841*10105Sadam.leventhal@sun.com  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
842*10105Sadam.leventhal@sun.com  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
843*10105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
844*10105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
845*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
846*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
847*10105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
848*10105Sadam.leventhal@sun.com  * ~~                                                                 ~~
849*10105Sadam.leventhal@sun.com  *                   __                               __
850*10105Sadam.leventhal@sun.com  *                   |  0   0   1   0   0   0   0   0  |
851*10105Sadam.leventhal@sun.com  *                   | 167 100  5   41 159 169 217 208 |
852*10105Sadam.leventhal@sun.com  *                   | 166 100  4   40 158 168 216 209 |
853*10105Sadam.leventhal@sun.com  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
854*10105Sadam.leventhal@sun.com  *                   |  0   0   0   0   1   0   0   0  |
855*10105Sadam.leventhal@sun.com  *                   |  0   0   0   0   0   1   0   0  |
856*10105Sadam.leventhal@sun.com  *                   |  0   0   0   0   0   0   1   0  |
857*10105Sadam.leventhal@sun.com  *                   |  0   0   0   0   0   0   0   1  |
858*10105Sadam.leventhal@sun.com  *                   ~~                               ~~
859*10105Sadam.leventhal@sun.com  *
860*10105Sadam.leventhal@sun.com  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
861*10105Sadam.leventhal@sun.com  * of the missing data.
862*10105Sadam.leventhal@sun.com  *
863*10105Sadam.leventhal@sun.com  * As is apparent from the example above, the only non-trivial rows in the
864*10105Sadam.leventhal@sun.com  * inverse matrix correspond to the data disks that we're trying to
865*10105Sadam.leventhal@sun.com  * reconstruct. Indeed, those are the only rows we need as the others would
866*10105Sadam.leventhal@sun.com  * only be useful for reconstructing data known or assumed to be valid. For
867*10105Sadam.leventhal@sun.com  * that reason, we only build the coefficients in the rows that correspond to
868*10105Sadam.leventhal@sun.com  * targeted columns.
869*10105Sadam.leventhal@sun.com  */
870*10105Sadam.leventhal@sun.com /* END CSTYLED */
871*10105Sadam.leventhal@sun.com 
872*10105Sadam.leventhal@sun.com static void
873*10105Sadam.leventhal@sun.com vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
874*10105Sadam.leventhal@sun.com     uint8_t **rows)
875*10105Sadam.leventhal@sun.com {
876*10105Sadam.leventhal@sun.com 	int i, j;
877*10105Sadam.leventhal@sun.com 	int pow;
878*10105Sadam.leventhal@sun.com 
879*10105Sadam.leventhal@sun.com 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
880*10105Sadam.leventhal@sun.com 
881*10105Sadam.leventhal@sun.com 	/*
882*10105Sadam.leventhal@sun.com 	 * Fill in the missing rows of interest.
883*10105Sadam.leventhal@sun.com 	 */
884*10105Sadam.leventhal@sun.com 	for (i = 0; i < nmap; i++) {
885*10105Sadam.leventhal@sun.com 		ASSERT3S(0, <=, map[i]);
886*10105Sadam.leventhal@sun.com 		ASSERT3S(map[i], <=, 2);
887*10105Sadam.leventhal@sun.com 
888*10105Sadam.leventhal@sun.com 		pow = map[i] * n;
889*10105Sadam.leventhal@sun.com 		if (pow > 255)
890*10105Sadam.leventhal@sun.com 			pow -= 255;
891*10105Sadam.leventhal@sun.com 		ASSERT(pow <= 255);
892*10105Sadam.leventhal@sun.com 
893*10105Sadam.leventhal@sun.com 		for (j = 0; j < n; j++) {
894*10105Sadam.leventhal@sun.com 			pow -= map[i];
895*10105Sadam.leventhal@sun.com 			if (pow < 0)
896*10105Sadam.leventhal@sun.com 				pow += 255;
897*10105Sadam.leventhal@sun.com 			rows[i][j] = vdev_raidz_pow2[pow];
898*10105Sadam.leventhal@sun.com 		}
899*10105Sadam.leventhal@sun.com 	}
9002082Seschrock }
9012082Seschrock 
902*10105Sadam.leventhal@sun.com static void
903*10105Sadam.leventhal@sun.com vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
904*10105Sadam.leventhal@sun.com     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
905*10105Sadam.leventhal@sun.com {
906*10105Sadam.leventhal@sun.com 	int i, j, ii, jj;
907*10105Sadam.leventhal@sun.com 	uint8_t log;
908*10105Sadam.leventhal@sun.com 
909*10105Sadam.leventhal@sun.com 	/*
910*10105Sadam.leventhal@sun.com 	 * Assert that the first nmissing entries from the array of used
911*10105Sadam.leventhal@sun.com 	 * columns correspond to parity columns and that subsequent entries
912*10105Sadam.leventhal@sun.com 	 * correspond to data columns.
913*10105Sadam.leventhal@sun.com 	 */
914*10105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
915*10105Sadam.leventhal@sun.com 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
916*10105Sadam.leventhal@sun.com 	}
917*10105Sadam.leventhal@sun.com 	for (; i < n; i++) {
918*10105Sadam.leventhal@sun.com 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
919*10105Sadam.leventhal@sun.com 	}
920*10105Sadam.leventhal@sun.com 
921*10105Sadam.leventhal@sun.com 	/*
922*10105Sadam.leventhal@sun.com 	 * First initialize the storage where we'll compute the inverse rows.
923*10105Sadam.leventhal@sun.com 	 */
924*10105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
925*10105Sadam.leventhal@sun.com 		for (j = 0; j < n; j++) {
926*10105Sadam.leventhal@sun.com 			invrows[i][j] = (i == j) ? 1 : 0;
927*10105Sadam.leventhal@sun.com 		}
928*10105Sadam.leventhal@sun.com 	}
929*10105Sadam.leventhal@sun.com 
930*10105Sadam.leventhal@sun.com 	/*
931*10105Sadam.leventhal@sun.com 	 * Subtract all trivial rows from the rows of consequence.
932*10105Sadam.leventhal@sun.com 	 */
933*10105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
934*10105Sadam.leventhal@sun.com 		for (j = nmissing; j < n; j++) {
935*10105Sadam.leventhal@sun.com 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
936*10105Sadam.leventhal@sun.com 			jj = used[j] - rm->rm_firstdatacol;
937*10105Sadam.leventhal@sun.com 			ASSERT3S(jj, <, n);
938*10105Sadam.leventhal@sun.com 			invrows[i][j] = rows[i][jj];
939*10105Sadam.leventhal@sun.com 			rows[i][jj] = 0;
940*10105Sadam.leventhal@sun.com 		}
941*10105Sadam.leventhal@sun.com 	}
942*10105Sadam.leventhal@sun.com 
943*10105Sadam.leventhal@sun.com 	/*
944*10105Sadam.leventhal@sun.com 	 * For each of the rows of interest, we must normalize it and subtract
945*10105Sadam.leventhal@sun.com 	 * a multiple of it from the other rows.
946*10105Sadam.leventhal@sun.com 	 */
947*10105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
948*10105Sadam.leventhal@sun.com 		for (j = 0; j < missing[i]; j++) {
949*10105Sadam.leventhal@sun.com 			ASSERT3U(rows[i][j], ==, 0);
950*10105Sadam.leventhal@sun.com 		}
951*10105Sadam.leventhal@sun.com 		ASSERT3U(rows[i][missing[i]], !=, 0);
952*10105Sadam.leventhal@sun.com 
953*10105Sadam.leventhal@sun.com 		/*
954*10105Sadam.leventhal@sun.com 		 * Compute the inverse of the first element and multiply each
955*10105Sadam.leventhal@sun.com 		 * element in the row by that value.
956*10105Sadam.leventhal@sun.com 		 */
957*10105Sadam.leventhal@sun.com 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
958*10105Sadam.leventhal@sun.com 
959*10105Sadam.leventhal@sun.com 		for (j = 0; j < n; j++) {
960*10105Sadam.leventhal@sun.com 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
961*10105Sadam.leventhal@sun.com 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
962*10105Sadam.leventhal@sun.com 		}
963*10105Sadam.leventhal@sun.com 
964*10105Sadam.leventhal@sun.com 		for (ii = 0; ii < nmissing; ii++) {
965*10105Sadam.leventhal@sun.com 			if (i == ii)
966*10105Sadam.leventhal@sun.com 				continue;
967*10105Sadam.leventhal@sun.com 
968*10105Sadam.leventhal@sun.com 			ASSERT3U(rows[ii][missing[i]], !=, 0);
969*10105Sadam.leventhal@sun.com 
970*10105Sadam.leventhal@sun.com 			log = vdev_raidz_log2[rows[ii][missing[i]]];
971*10105Sadam.leventhal@sun.com 
972*10105Sadam.leventhal@sun.com 			for (j = 0; j < n; j++) {
973*10105Sadam.leventhal@sun.com 				rows[ii][j] ^=
974*10105Sadam.leventhal@sun.com 				    vdev_raidz_exp2(rows[i][j], log);
975*10105Sadam.leventhal@sun.com 				invrows[ii][j] ^=
976*10105Sadam.leventhal@sun.com 				    vdev_raidz_exp2(invrows[i][j], log);
977*10105Sadam.leventhal@sun.com 			}
978*10105Sadam.leventhal@sun.com 		}
979*10105Sadam.leventhal@sun.com 	}
980*10105Sadam.leventhal@sun.com 
981*10105Sadam.leventhal@sun.com 	/*
982*10105Sadam.leventhal@sun.com 	 * Verify that the data that is left in the rows are properly part of
983*10105Sadam.leventhal@sun.com 	 * an identity matrix.
984*10105Sadam.leventhal@sun.com 	 */
985*10105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
986*10105Sadam.leventhal@sun.com 		for (j = 0; j < n; j++) {
987*10105Sadam.leventhal@sun.com 			if (j == missing[i]) {
988*10105Sadam.leventhal@sun.com 				ASSERT3U(rows[i][j], ==, 1);
989*10105Sadam.leventhal@sun.com 			} else {
990*10105Sadam.leventhal@sun.com 				ASSERT3U(rows[i][j], ==, 0);
991*10105Sadam.leventhal@sun.com 			}
992*10105Sadam.leventhal@sun.com 		}
993*10105Sadam.leventhal@sun.com 	}
994*10105Sadam.leventhal@sun.com }
995*10105Sadam.leventhal@sun.com 
996*10105Sadam.leventhal@sun.com static void
997*10105Sadam.leventhal@sun.com vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
998*10105Sadam.leventhal@sun.com     int *missing, uint8_t **invrows, const uint8_t *used)
999*10105Sadam.leventhal@sun.com {
1000*10105Sadam.leventhal@sun.com 	int i, j, x, cc, c;
1001*10105Sadam.leventhal@sun.com 	uint8_t *src;
1002*10105Sadam.leventhal@sun.com 	uint64_t ccount;
1003*10105Sadam.leventhal@sun.com 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1004*10105Sadam.leventhal@sun.com 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1005*10105Sadam.leventhal@sun.com 	uint8_t log, val;
1006*10105Sadam.leventhal@sun.com 	int ll;
1007*10105Sadam.leventhal@sun.com 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1008*10105Sadam.leventhal@sun.com 	uint8_t *p, *pp;
1009*10105Sadam.leventhal@sun.com 	size_t psize;
1010*10105Sadam.leventhal@sun.com 
1011*10105Sadam.leventhal@sun.com 	psize = sizeof (invlog[0][0]) * n * nmissing;
1012*10105Sadam.leventhal@sun.com 	p = kmem_alloc(psize, KM_SLEEP);
1013*10105Sadam.leventhal@sun.com 
1014*10105Sadam.leventhal@sun.com 	for (pp = p, i = 0; i < nmissing; i++) {
1015*10105Sadam.leventhal@sun.com 		invlog[i] = pp;
1016*10105Sadam.leventhal@sun.com 		pp += n;
1017*10105Sadam.leventhal@sun.com 	}
1018*10105Sadam.leventhal@sun.com 
1019*10105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
1020*10105Sadam.leventhal@sun.com 		for (j = 0; j < n; j++) {
1021*10105Sadam.leventhal@sun.com 			ASSERT3U(invrows[i][j], !=, 0);
1022*10105Sadam.leventhal@sun.com 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1023*10105Sadam.leventhal@sun.com 		}
1024*10105Sadam.leventhal@sun.com 	}
1025*10105Sadam.leventhal@sun.com 
1026*10105Sadam.leventhal@sun.com 	for (i = 0; i < n; i++) {
1027*10105Sadam.leventhal@sun.com 		c = used[i];
1028*10105Sadam.leventhal@sun.com 		ASSERT3U(c, <, rm->rm_cols);
1029*10105Sadam.leventhal@sun.com 
1030*10105Sadam.leventhal@sun.com 		src = rm->rm_col[c].rc_data;
1031*10105Sadam.leventhal@sun.com 		ccount = rm->rm_col[c].rc_size;
1032*10105Sadam.leventhal@sun.com 		for (j = 0; j < nmissing; j++) {
1033*10105Sadam.leventhal@sun.com 			cc = missing[j] + rm->rm_firstdatacol;
1034*10105Sadam.leventhal@sun.com 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1035*10105Sadam.leventhal@sun.com 			ASSERT3U(cc, <, rm->rm_cols);
1036*10105Sadam.leventhal@sun.com 			ASSERT3U(cc, !=, c);
1037*10105Sadam.leventhal@sun.com 
1038*10105Sadam.leventhal@sun.com 			dst[j] = rm->rm_col[cc].rc_data;
1039*10105Sadam.leventhal@sun.com 			dcount[j] = rm->rm_col[cc].rc_size;
1040*10105Sadam.leventhal@sun.com 		}
1041*10105Sadam.leventhal@sun.com 
1042*10105Sadam.leventhal@sun.com 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1043*10105Sadam.leventhal@sun.com 
1044*10105Sadam.leventhal@sun.com 		for (x = 0; x < ccount; x++, src++) {
1045*10105Sadam.leventhal@sun.com 			if (*src != 0)
1046*10105Sadam.leventhal@sun.com 				log = vdev_raidz_log2[*src];
1047*10105Sadam.leventhal@sun.com 
1048*10105Sadam.leventhal@sun.com 			for (cc = 0; cc < nmissing; cc++) {
1049*10105Sadam.leventhal@sun.com 				if (x >= dcount[cc])
1050*10105Sadam.leventhal@sun.com 					continue;
1051*10105Sadam.leventhal@sun.com 
1052*10105Sadam.leventhal@sun.com 				if (*src == 0) {
1053*10105Sadam.leventhal@sun.com 					val = 0;
1054*10105Sadam.leventhal@sun.com 				} else {
1055*10105Sadam.leventhal@sun.com 					if ((ll = log + invlog[cc][i]) >= 255)
1056*10105Sadam.leventhal@sun.com 						ll -= 255;
1057*10105Sadam.leventhal@sun.com 					val = vdev_raidz_pow2[ll];
1058*10105Sadam.leventhal@sun.com 				}
1059*10105Sadam.leventhal@sun.com 
1060*10105Sadam.leventhal@sun.com 				if (i == 0)
1061*10105Sadam.leventhal@sun.com 					dst[cc][x] = val;
1062*10105Sadam.leventhal@sun.com 				else
1063*10105Sadam.leventhal@sun.com 					dst[cc][x] ^= val;
1064*10105Sadam.leventhal@sun.com 			}
1065*10105Sadam.leventhal@sun.com 		}
1066*10105Sadam.leventhal@sun.com 	}
1067*10105Sadam.leventhal@sun.com 
1068*10105Sadam.leventhal@sun.com 	kmem_free(p, psize);
1069*10105Sadam.leventhal@sun.com }
1070*10105Sadam.leventhal@sun.com 
1071*10105Sadam.leventhal@sun.com static int
1072*10105Sadam.leventhal@sun.com vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1073*10105Sadam.leventhal@sun.com {
1074*10105Sadam.leventhal@sun.com 	int n, i, c, t, tt;
1075*10105Sadam.leventhal@sun.com 	int nmissing_rows;
1076*10105Sadam.leventhal@sun.com 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1077*10105Sadam.leventhal@sun.com 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1078*10105Sadam.leventhal@sun.com 
1079*10105Sadam.leventhal@sun.com 	uint8_t *p, *pp;
1080*10105Sadam.leventhal@sun.com 	size_t psize;
1081*10105Sadam.leventhal@sun.com 
1082*10105Sadam.leventhal@sun.com 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1083*10105Sadam.leventhal@sun.com 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1084*10105Sadam.leventhal@sun.com 	uint8_t *used;
1085*10105Sadam.leventhal@sun.com 
1086*10105Sadam.leventhal@sun.com 	int code = 0;
1087*10105Sadam.leventhal@sun.com 
1088*10105Sadam.leventhal@sun.com 
1089*10105Sadam.leventhal@sun.com 	n = rm->rm_cols - rm->rm_firstdatacol;
1090*10105Sadam.leventhal@sun.com 
1091*10105Sadam.leventhal@sun.com 	/*
1092*10105Sadam.leventhal@sun.com 	 * Figure out which data columns are missing.
1093*10105Sadam.leventhal@sun.com 	 */
1094*10105Sadam.leventhal@sun.com 	nmissing_rows = 0;
1095*10105Sadam.leventhal@sun.com 	for (t = 0; t < ntgts; t++) {
1096*10105Sadam.leventhal@sun.com 		if (tgts[t] >= rm->rm_firstdatacol) {
1097*10105Sadam.leventhal@sun.com 			missing_rows[nmissing_rows++] =
1098*10105Sadam.leventhal@sun.com 			    tgts[t] - rm->rm_firstdatacol;
1099*10105Sadam.leventhal@sun.com 		}
1100*10105Sadam.leventhal@sun.com 	}
1101*10105Sadam.leventhal@sun.com 
1102*10105Sadam.leventhal@sun.com 	/*
1103*10105Sadam.leventhal@sun.com 	 * Figure out which parity columns to use to help generate the missing
1104*10105Sadam.leventhal@sun.com 	 * data columns.
1105*10105Sadam.leventhal@sun.com 	 */
1106*10105Sadam.leventhal@sun.com 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1107*10105Sadam.leventhal@sun.com 		ASSERT(tt < ntgts);
1108*10105Sadam.leventhal@sun.com 		ASSERT(c < rm->rm_firstdatacol);
1109*10105Sadam.leventhal@sun.com 
1110*10105Sadam.leventhal@sun.com 		/*
1111*10105Sadam.leventhal@sun.com 		 * Skip any targeted parity columns.
1112*10105Sadam.leventhal@sun.com 		 */
1113*10105Sadam.leventhal@sun.com 		if (c == tgts[tt]) {
1114*10105Sadam.leventhal@sun.com 			tt++;
1115*10105Sadam.leventhal@sun.com 			continue;
1116*10105Sadam.leventhal@sun.com 		}
1117*10105Sadam.leventhal@sun.com 
1118*10105Sadam.leventhal@sun.com 		code |= 1 << c;
1119*10105Sadam.leventhal@sun.com 
1120*10105Sadam.leventhal@sun.com 		parity_map[i] = c;
1121*10105Sadam.leventhal@sun.com 		i++;
1122*10105Sadam.leventhal@sun.com 	}
1123*10105Sadam.leventhal@sun.com 
1124*10105Sadam.leventhal@sun.com 	ASSERT(code != 0);
1125*10105Sadam.leventhal@sun.com 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1126*10105Sadam.leventhal@sun.com 
1127*10105Sadam.leventhal@sun.com 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1128*10105Sadam.leventhal@sun.com 	    nmissing_rows * n + sizeof (used[0]) * n;
1129*10105Sadam.leventhal@sun.com 	p = kmem_alloc(psize, KM_SLEEP);
1130*10105Sadam.leventhal@sun.com 
1131*10105Sadam.leventhal@sun.com 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1132*10105Sadam.leventhal@sun.com 		rows[i] = pp;
1133*10105Sadam.leventhal@sun.com 		pp += n;
1134*10105Sadam.leventhal@sun.com 		invrows[i] = pp;
1135*10105Sadam.leventhal@sun.com 		pp += n;
1136*10105Sadam.leventhal@sun.com 	}
1137*10105Sadam.leventhal@sun.com 	used = pp;
1138*10105Sadam.leventhal@sun.com 
1139*10105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing_rows; i++) {
1140*10105Sadam.leventhal@sun.com 		used[i] = parity_map[i];
1141*10105Sadam.leventhal@sun.com 	}
1142*10105Sadam.leventhal@sun.com 
1143*10105Sadam.leventhal@sun.com 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1144*10105Sadam.leventhal@sun.com 		if (tt < nmissing_rows &&
1145*10105Sadam.leventhal@sun.com 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1146*10105Sadam.leventhal@sun.com 			tt++;
1147*10105Sadam.leventhal@sun.com 			continue;
1148*10105Sadam.leventhal@sun.com 		}
1149*10105Sadam.leventhal@sun.com 
1150*10105Sadam.leventhal@sun.com 		ASSERT3S(i, <, n);
1151*10105Sadam.leventhal@sun.com 		used[i] = c;
1152*10105Sadam.leventhal@sun.com 		i++;
1153*10105Sadam.leventhal@sun.com 	}
1154*10105Sadam.leventhal@sun.com 
1155*10105Sadam.leventhal@sun.com 	/*
1156*10105Sadam.leventhal@sun.com 	 * Initialize the interesting rows of the matrix.
1157*10105Sadam.leventhal@sun.com 	 */
1158*10105Sadam.leventhal@sun.com 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1159*10105Sadam.leventhal@sun.com 
1160*10105Sadam.leventhal@sun.com 	/*
1161*10105Sadam.leventhal@sun.com 	 * Invert the matrix.
1162*10105Sadam.leventhal@sun.com 	 */
1163*10105Sadam.leventhal@sun.com 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1164*10105Sadam.leventhal@sun.com 	    invrows, used);
1165*10105Sadam.leventhal@sun.com 
1166*10105Sadam.leventhal@sun.com 	/*
1167*10105Sadam.leventhal@sun.com 	 * Reconstruct the missing data using the generated matrix.
1168*10105Sadam.leventhal@sun.com 	 */
1169*10105Sadam.leventhal@sun.com 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1170*10105Sadam.leventhal@sun.com 	    invrows, used);
1171*10105Sadam.leventhal@sun.com 
1172*10105Sadam.leventhal@sun.com 	kmem_free(p, psize);
1173*10105Sadam.leventhal@sun.com 
1174*10105Sadam.leventhal@sun.com 	return (code);
1175*10105Sadam.leventhal@sun.com }
1176*10105Sadam.leventhal@sun.com 
1177*10105Sadam.leventhal@sun.com static int
1178*10105Sadam.leventhal@sun.com vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1179*10105Sadam.leventhal@sun.com {
1180*10105Sadam.leventhal@sun.com 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1181*10105Sadam.leventhal@sun.com 	int ntgts;
1182*10105Sadam.leventhal@sun.com 	int i, c;
1183*10105Sadam.leventhal@sun.com 	int code;
1184*10105Sadam.leventhal@sun.com 	int nbadparity, nbaddata;
1185*10105Sadam.leventhal@sun.com 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1186*10105Sadam.leventhal@sun.com 
1187*10105Sadam.leventhal@sun.com 	/*
1188*10105Sadam.leventhal@sun.com 	 * The tgts list must already be sorted.
1189*10105Sadam.leventhal@sun.com 	 */
1190*10105Sadam.leventhal@sun.com 	for (i = 1; i < nt; i++) {
1191*10105Sadam.leventhal@sun.com 		ASSERT(t[i] > t[i - 1]);
1192*10105Sadam.leventhal@sun.com 	}
1193*10105Sadam.leventhal@sun.com 
1194*10105Sadam.leventhal@sun.com 	nbadparity = rm->rm_firstdatacol;
1195*10105Sadam.leventhal@sun.com 	nbaddata = rm->rm_cols - nbadparity;
1196*10105Sadam.leventhal@sun.com 	ntgts = 0;
1197*10105Sadam.leventhal@sun.com 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1198*10105Sadam.leventhal@sun.com 		if (c < rm->rm_firstdatacol)
1199*10105Sadam.leventhal@sun.com 			parity_valid[c] = B_FALSE;
1200*10105Sadam.leventhal@sun.com 
1201*10105Sadam.leventhal@sun.com 		if (i < nt && c == t[i]) {
1202*10105Sadam.leventhal@sun.com 			tgts[ntgts++] = c;
1203*10105Sadam.leventhal@sun.com 			i++;
1204*10105Sadam.leventhal@sun.com 		} else if (rm->rm_col[c].rc_error != 0) {
1205*10105Sadam.leventhal@sun.com 			tgts[ntgts++] = c;
1206*10105Sadam.leventhal@sun.com 		} else if (c >= rm->rm_firstdatacol) {
1207*10105Sadam.leventhal@sun.com 			nbaddata--;
1208*10105Sadam.leventhal@sun.com 		} else {
1209*10105Sadam.leventhal@sun.com 			parity_valid[c] = B_TRUE;
1210*10105Sadam.leventhal@sun.com 			nbadparity--;
1211*10105Sadam.leventhal@sun.com 		}
1212*10105Sadam.leventhal@sun.com 	}
1213*10105Sadam.leventhal@sun.com 
1214*10105Sadam.leventhal@sun.com 	ASSERT(ntgts >= nt);
1215*10105Sadam.leventhal@sun.com 	ASSERT(nbaddata >= 0);
1216*10105Sadam.leventhal@sun.com 	ASSERT(nbaddata + nbadparity == ntgts);
1217*10105Sadam.leventhal@sun.com 
1218*10105Sadam.leventhal@sun.com 	dt = &tgts[nbadparity];
1219*10105Sadam.leventhal@sun.com 
1220*10105Sadam.leventhal@sun.com 	/*
1221*10105Sadam.leventhal@sun.com 	 * See if we can use any of our optimized reconstruction routines.
1222*10105Sadam.leventhal@sun.com 	 */
1223*10105Sadam.leventhal@sun.com 	if (!vdev_raidz_default_to_general) {
1224*10105Sadam.leventhal@sun.com 		switch (nbaddata) {
1225*10105Sadam.leventhal@sun.com 		case 1:
1226*10105Sadam.leventhal@sun.com 			if (parity_valid[VDEV_RAIDZ_P])
1227*10105Sadam.leventhal@sun.com 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1228*10105Sadam.leventhal@sun.com 
1229*10105Sadam.leventhal@sun.com 			ASSERT(rm->rm_firstdatacol > 1);
1230*10105Sadam.leventhal@sun.com 
1231*10105Sadam.leventhal@sun.com 			if (parity_valid[VDEV_RAIDZ_Q])
1232*10105Sadam.leventhal@sun.com 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1233*10105Sadam.leventhal@sun.com 
1234*10105Sadam.leventhal@sun.com 			ASSERT(rm->rm_firstdatacol > 2);
1235*10105Sadam.leventhal@sun.com 			break;
1236*10105Sadam.leventhal@sun.com 
1237*10105Sadam.leventhal@sun.com 		case 2:
1238*10105Sadam.leventhal@sun.com 			ASSERT(rm->rm_firstdatacol > 1);
1239*10105Sadam.leventhal@sun.com 
1240*10105Sadam.leventhal@sun.com 			if (parity_valid[VDEV_RAIDZ_P] &&
1241*10105Sadam.leventhal@sun.com 			    parity_valid[VDEV_RAIDZ_Q])
1242*10105Sadam.leventhal@sun.com 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1243*10105Sadam.leventhal@sun.com 
1244*10105Sadam.leventhal@sun.com 			ASSERT(rm->rm_firstdatacol > 2);
1245*10105Sadam.leventhal@sun.com 
1246*10105Sadam.leventhal@sun.com 			break;
1247*10105Sadam.leventhal@sun.com 		}
1248*10105Sadam.leventhal@sun.com 	}
1249*10105Sadam.leventhal@sun.com 
1250*10105Sadam.leventhal@sun.com 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1251*10105Sadam.leventhal@sun.com 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1252*10105Sadam.leventhal@sun.com 	ASSERT(code > 0);
1253*10105Sadam.leventhal@sun.com 	return (code);
1254*10105Sadam.leventhal@sun.com }
12552082Seschrock 
1256789Sahrens static int
1257789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
1258789Sahrens {
1259*10105Sadam.leventhal@sun.com 	vdev_t *cvd;
12602082Seschrock 	uint64_t nparity = vd->vdev_nparity;
1261*10105Sadam.leventhal@sun.com 	int c;
1262789Sahrens 	int lasterror = 0;
1263789Sahrens 	int numerrors = 0;
1264789Sahrens 
12652082Seschrock 	ASSERT(nparity > 0);
12662082Seschrock 
12672082Seschrock 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
12682082Seschrock 	    vd->vdev_children < nparity + 1) {
1269789Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1270789Sahrens 		return (EINVAL);
1271789Sahrens 	}
1272789Sahrens 
12739846SEric.Taylor@Sun.COM 	vdev_open_children(vd);
1274789Sahrens 
1275*10105Sadam.leventhal@sun.com 	for (c = 0; c < vd->vdev_children; c++) {
1276*10105Sadam.leventhal@sun.com 		cvd = vd->vdev_child[c];
12779846SEric.Taylor@Sun.COM 
1278*10105Sadam.leventhal@sun.com 		if (cvd->vdev_open_error != 0) {
12799846SEric.Taylor@Sun.COM 			lasterror = cvd->vdev_open_error;
1280789Sahrens 			numerrors++;
1281789Sahrens 			continue;
1282789Sahrens 		}
1283789Sahrens 
1284789Sahrens 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
12851732Sbonwick 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1286789Sahrens 	}
1287789Sahrens 
1288789Sahrens 	*asize *= vd->vdev_children;
1289789Sahrens 
12902082Seschrock 	if (numerrors > nparity) {
1291789Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1292789Sahrens 		return (lasterror);
1293789Sahrens 	}
1294789Sahrens 
1295789Sahrens 	return (0);
1296789Sahrens }
1297789Sahrens 
1298789Sahrens static void
1299789Sahrens vdev_raidz_close(vdev_t *vd)
1300789Sahrens {
1301*10105Sadam.leventhal@sun.com 	int c;
1302*10105Sadam.leventhal@sun.com 
1303*10105Sadam.leventhal@sun.com 	for (c = 0; c < vd->vdev_children; c++)
1304789Sahrens 		vdev_close(vd->vdev_child[c]);
1305789Sahrens }
1306789Sahrens 
1307789Sahrens static uint64_t
1308789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1309789Sahrens {
1310789Sahrens 	uint64_t asize;
13111732Sbonwick 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1312789Sahrens 	uint64_t cols = vd->vdev_children;
13132082Seschrock 	uint64_t nparity = vd->vdev_nparity;
1314789Sahrens 
13151732Sbonwick 	asize = ((psize - 1) >> ashift) + 1;
13162082Seschrock 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
13172082Seschrock 	asize = roundup(asize, nparity + 1) << ashift;
1318789Sahrens 
1319789Sahrens 	return (asize);
1320789Sahrens }
1321789Sahrens 
1322789Sahrens static void
1323789Sahrens vdev_raidz_child_done(zio_t *zio)
1324789Sahrens {
1325789Sahrens 	raidz_col_t *rc = zio->io_private;
1326789Sahrens 
1327789Sahrens 	rc->rc_error = zio->io_error;
1328789Sahrens 	rc->rc_tried = 1;
1329789Sahrens 	rc->rc_skipped = 0;
1330789Sahrens }
1331789Sahrens 
13325530Sbonwick static int
1333789Sahrens vdev_raidz_io_start(zio_t *zio)
1334789Sahrens {
1335789Sahrens 	vdev_t *vd = zio->io_vd;
13361732Sbonwick 	vdev_t *tvd = vd->vdev_top;
1337789Sahrens 	vdev_t *cvd;
1338789Sahrens 	blkptr_t *bp = zio->io_bp;
1339789Sahrens 	raidz_map_t *rm;
1340789Sahrens 	raidz_col_t *rc;
1341*10105Sadam.leventhal@sun.com 	int c, i;
1342789Sahrens 
13432082Seschrock 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
13442082Seschrock 	    vd->vdev_nparity);
1345789Sahrens 
13461775Sbillm 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1347789Sahrens 
1348789Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
1349*10105Sadam.leventhal@sun.com 		vdev_raidz_generate_parity(rm);
1350789Sahrens 
1351789Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
1352789Sahrens 			rc = &rm->rm_col[c];
13532082Seschrock 			cvd = vd->vdev_child[rc->rc_devidx];
1354789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1355789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
13567754SJeff.Bonwick@Sun.COM 			    zio->io_type, zio->io_priority, 0,
1357789Sahrens 			    vdev_raidz_child_done, rc));
1358789Sahrens 		}
13595530Sbonwick 
1360*10105Sadam.leventhal@sun.com 		/*
1361*10105Sadam.leventhal@sun.com 		 * Generate optional I/Os for any skipped sectors to improve
1362*10105Sadam.leventhal@sun.com 		 * aggregation contiguity.
1363*10105Sadam.leventhal@sun.com 		 */
1364*10105Sadam.leventhal@sun.com 		for (c = rm->rm_bigcols, i = 0; i < rm->rm_skipped; c++, i++) {
1365*10105Sadam.leventhal@sun.com 			ASSERT(c <= rm->rm_scols);
1366*10105Sadam.leventhal@sun.com 			if (c == rm->rm_scols)
1367*10105Sadam.leventhal@sun.com 				c = 0;
1368*10105Sadam.leventhal@sun.com 			rc = &rm->rm_col[c];
1369*10105Sadam.leventhal@sun.com 			cvd = vd->vdev_child[rc->rc_devidx];
1370*10105Sadam.leventhal@sun.com 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1371*10105Sadam.leventhal@sun.com 			    rc->rc_offset + rc->rc_size, NULL,
1372*10105Sadam.leventhal@sun.com 			    1 << tvd->vdev_ashift,
1373*10105Sadam.leventhal@sun.com 			    zio->io_type, zio->io_priority,
1374*10105Sadam.leventhal@sun.com 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1375*10105Sadam.leventhal@sun.com 		}
1376*10105Sadam.leventhal@sun.com 
13777754SJeff.Bonwick@Sun.COM 		return (ZIO_PIPELINE_CONTINUE);
1378789Sahrens 	}
1379789Sahrens 
1380789Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1381789Sahrens 
13822082Seschrock 	/*
13832082Seschrock 	 * Iterate over the columns in reverse order so that we hit the parity
1384*10105Sadam.leventhal@sun.com 	 * last -- any errors along the way will force us to read the parity.
13852082Seschrock 	 */
1386789Sahrens 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1387789Sahrens 		rc = &rm->rm_col[c];
13882082Seschrock 		cvd = vd->vdev_child[rc->rc_devidx];
13895329Sgw25295 		if (!vdev_readable(cvd)) {
13902082Seschrock 			if (c >= rm->rm_firstdatacol)
13912082Seschrock 				rm->rm_missingdata++;
13922082Seschrock 			else
13932082Seschrock 				rm->rm_missingparity++;
1394789Sahrens 			rc->rc_error = ENXIO;
1395789Sahrens 			rc->rc_tried = 1;	/* don't even try */
1396789Sahrens 			rc->rc_skipped = 1;
1397789Sahrens 			continue;
1398789Sahrens 		}
13998241SJeff.Bonwick@Sun.COM 		if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
14002082Seschrock 			if (c >= rm->rm_firstdatacol)
14012082Seschrock 				rm->rm_missingdata++;
14022082Seschrock 			else
14032082Seschrock 				rm->rm_missingparity++;
1404789Sahrens 			rc->rc_error = ESTALE;
1405789Sahrens 			rc->rc_skipped = 1;
1406789Sahrens 			continue;
1407789Sahrens 		}
14082082Seschrock 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
14099434SMark.Musante@Sun.COM 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1410789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1411789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
14127754SJeff.Bonwick@Sun.COM 			    zio->io_type, zio->io_priority, 0,
1413789Sahrens 			    vdev_raidz_child_done, rc));
1414789Sahrens 		}
1415789Sahrens 	}
1416789Sahrens 
14177754SJeff.Bonwick@Sun.COM 	return (ZIO_PIPELINE_CONTINUE);
1418789Sahrens }
1419789Sahrens 
14201544Seschrock /*
14211544Seschrock  * Report a checksum error for a child of a RAID-Z device.
14221544Seschrock  */
14231544Seschrock static void
14241544Seschrock raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
14251544Seschrock {
14262082Seschrock 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
14271544Seschrock 
14281544Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
14291544Seschrock 		mutex_enter(&vd->vdev_stat_lock);
14301544Seschrock 		vd->vdev_stat.vs_checksum_errors++;
14311544Seschrock 		mutex_exit(&vd->vdev_stat_lock);
14321544Seschrock 	}
14331544Seschrock 
14341544Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
14351544Seschrock 		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
14361544Seschrock 		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
14371544Seschrock }
14381544Seschrock 
14392082Seschrock /*
14402082Seschrock  * Generate the parity from the data columns. If we tried and were able to
14412082Seschrock  * read the parity without error, verify that the generated parity matches the
14422082Seschrock  * data we read. If it doesn't, we fire off a checksum error. Return the
14432082Seschrock  * number such failures.
14442082Seschrock  */
14452082Seschrock static int
14462082Seschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
14472082Seschrock {
14482082Seschrock 	void *orig[VDEV_RAIDZ_MAXPARITY];
14492082Seschrock 	int c, ret = 0;
14502082Seschrock 	raidz_col_t *rc;
14512082Seschrock 
14522082Seschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
14532082Seschrock 		rc = &rm->rm_col[c];
14542082Seschrock 		if (!rc->rc_tried || rc->rc_error != 0)
14552082Seschrock 			continue;
14562082Seschrock 		orig[c] = zio_buf_alloc(rc->rc_size);
14572082Seschrock 		bcopy(rc->rc_data, orig[c], rc->rc_size);
14582082Seschrock 	}
14592082Seschrock 
1460*10105Sadam.leventhal@sun.com 	vdev_raidz_generate_parity(rm);
14612082Seschrock 
14622082Seschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
14632082Seschrock 		rc = &rm->rm_col[c];
14642082Seschrock 		if (!rc->rc_tried || rc->rc_error != 0)
14652082Seschrock 			continue;
14662082Seschrock 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
14672082Seschrock 			raidz_checksum_error(zio, rc);
14682082Seschrock 			rc->rc_error = ECKSUM;
14692082Seschrock 			ret++;
14702082Seschrock 		}
14712082Seschrock 		zio_buf_free(orig[c], rc->rc_size);
14722082Seschrock 	}
14732082Seschrock 
14742082Seschrock 	return (ret);
14752082Seschrock }
14762082Seschrock 
1477*10105Sadam.leventhal@sun.com /*
1478*10105Sadam.leventhal@sun.com  * Keep statistics on all the ways that we used parity to correct data.
1479*10105Sadam.leventhal@sun.com  */
1480*10105Sadam.leventhal@sun.com static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
14811544Seschrock 
14825530Sbonwick static int
14837754SJeff.Bonwick@Sun.COM vdev_raidz_worst_error(raidz_map_t *rm)
14847754SJeff.Bonwick@Sun.COM {
14857754SJeff.Bonwick@Sun.COM 	int error = 0;
14867754SJeff.Bonwick@Sun.COM 
14877754SJeff.Bonwick@Sun.COM 	for (int c = 0; c < rm->rm_cols; c++)
14887754SJeff.Bonwick@Sun.COM 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
14897754SJeff.Bonwick@Sun.COM 
14907754SJeff.Bonwick@Sun.COM 	return (error);
14917754SJeff.Bonwick@Sun.COM }
14927754SJeff.Bonwick@Sun.COM 
1493*10105Sadam.leventhal@sun.com /*
1494*10105Sadam.leventhal@sun.com  * Iterate over all combinations of bad data and attempt a reconstruction.
1495*10105Sadam.leventhal@sun.com  * Note that the algorithm below is non-optimal because it doesn't take into
1496*10105Sadam.leventhal@sun.com  * account how reconstruction is actually performed. For example, with
1497*10105Sadam.leventhal@sun.com  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1498*10105Sadam.leventhal@sun.com  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1499*10105Sadam.leventhal@sun.com  * cases we'd only use parity information in column 0.
1500*10105Sadam.leventhal@sun.com  */
1501*10105Sadam.leventhal@sun.com static int
1502*10105Sadam.leventhal@sun.com vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1503*10105Sadam.leventhal@sun.com {
1504*10105Sadam.leventhal@sun.com 	raidz_map_t *rm = zio->io_vsd;
1505*10105Sadam.leventhal@sun.com 	raidz_col_t *rc;
1506*10105Sadam.leventhal@sun.com 	void *orig[VDEV_RAIDZ_MAXPARITY];
1507*10105Sadam.leventhal@sun.com 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1508*10105Sadam.leventhal@sun.com 	int *tgts = &tstore[1];
1509*10105Sadam.leventhal@sun.com 	int current, next, i, c, n;
1510*10105Sadam.leventhal@sun.com 	int code, ret = 0;
1511*10105Sadam.leventhal@sun.com 
1512*10105Sadam.leventhal@sun.com 	ASSERT(total_errors < rm->rm_firstdatacol);
1513*10105Sadam.leventhal@sun.com 
1514*10105Sadam.leventhal@sun.com 	/*
1515*10105Sadam.leventhal@sun.com 	 * This simplifies one edge condition.
1516*10105Sadam.leventhal@sun.com 	 */
1517*10105Sadam.leventhal@sun.com 	tgts[-1] = -1;
1518*10105Sadam.leventhal@sun.com 
1519*10105Sadam.leventhal@sun.com 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1520*10105Sadam.leventhal@sun.com 		/*
1521*10105Sadam.leventhal@sun.com 		 * Initialize the targets array by finding the first n columns
1522*10105Sadam.leventhal@sun.com 		 * that contain no error.
1523*10105Sadam.leventhal@sun.com 		 *
1524*10105Sadam.leventhal@sun.com 		 * If there were no data errors, we need to ensure that we're
1525*10105Sadam.leventhal@sun.com 		 * always explicitly attempting to reconstruct at least one
1526*10105Sadam.leventhal@sun.com 		 * data column. To do this, we simply push the highest target
1527*10105Sadam.leventhal@sun.com 		 * up into the data columns.
1528*10105Sadam.leventhal@sun.com 		 */
1529*10105Sadam.leventhal@sun.com 		for (c = 0, i = 0; i < n; i++) {
1530*10105Sadam.leventhal@sun.com 			if (i == n - 1 && data_errors == 0 &&
1531*10105Sadam.leventhal@sun.com 			    c < rm->rm_firstdatacol) {
1532*10105Sadam.leventhal@sun.com 				c = rm->rm_firstdatacol;
1533*10105Sadam.leventhal@sun.com 			}
1534*10105Sadam.leventhal@sun.com 
1535*10105Sadam.leventhal@sun.com 			while (rm->rm_col[c].rc_error != 0) {
1536*10105Sadam.leventhal@sun.com 				c++;
1537*10105Sadam.leventhal@sun.com 				ASSERT3S(c, <, rm->rm_cols);
1538*10105Sadam.leventhal@sun.com 			}
1539*10105Sadam.leventhal@sun.com 
1540*10105Sadam.leventhal@sun.com 			tgts[i] = c++;
1541*10105Sadam.leventhal@sun.com 		}
1542*10105Sadam.leventhal@sun.com 
1543*10105Sadam.leventhal@sun.com 		/*
1544*10105Sadam.leventhal@sun.com 		 * Setting tgts[n] simplifies the other edge condition.
1545*10105Sadam.leventhal@sun.com 		 */
1546*10105Sadam.leventhal@sun.com 		tgts[n] = rm->rm_cols;
1547*10105Sadam.leventhal@sun.com 
1548*10105Sadam.leventhal@sun.com 		/*
1549*10105Sadam.leventhal@sun.com 		 * These buffers were allocated in previous iterations.
1550*10105Sadam.leventhal@sun.com 		 */
1551*10105Sadam.leventhal@sun.com 		for (i = 0; i < n - 1; i++) {
1552*10105Sadam.leventhal@sun.com 			ASSERT(orig[i] != NULL);
1553*10105Sadam.leventhal@sun.com 		}
1554*10105Sadam.leventhal@sun.com 
1555*10105Sadam.leventhal@sun.com 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1556*10105Sadam.leventhal@sun.com 
1557*10105Sadam.leventhal@sun.com 		current = 0;
1558*10105Sadam.leventhal@sun.com 		next = tgts[current];
1559*10105Sadam.leventhal@sun.com 
1560*10105Sadam.leventhal@sun.com 		while (current != n) {
1561*10105Sadam.leventhal@sun.com 			tgts[current] = next;
1562*10105Sadam.leventhal@sun.com 			current = 0;
1563*10105Sadam.leventhal@sun.com 
1564*10105Sadam.leventhal@sun.com 			/*
1565*10105Sadam.leventhal@sun.com 			 * Save off the original data that we're going to
1566*10105Sadam.leventhal@sun.com 			 * attempt to reconstruct.
1567*10105Sadam.leventhal@sun.com 			 */
1568*10105Sadam.leventhal@sun.com 			for (i = 0; i < n; i++) {
1569*10105Sadam.leventhal@sun.com 				ASSERT(orig[i] != NULL);
1570*10105Sadam.leventhal@sun.com 				c = tgts[i];
1571*10105Sadam.leventhal@sun.com 				ASSERT3S(c, >=, 0);
1572*10105Sadam.leventhal@sun.com 				ASSERT3S(c, <, rm->rm_cols);
1573*10105Sadam.leventhal@sun.com 				rc = &rm->rm_col[c];
1574*10105Sadam.leventhal@sun.com 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1575*10105Sadam.leventhal@sun.com 			}
1576*10105Sadam.leventhal@sun.com 
1577*10105Sadam.leventhal@sun.com 			/*
1578*10105Sadam.leventhal@sun.com 			 * Attempt a reconstruction and exit the outer loop on
1579*10105Sadam.leventhal@sun.com 			 * success.
1580*10105Sadam.leventhal@sun.com 			 */
1581*10105Sadam.leventhal@sun.com 			code = vdev_raidz_reconstruct(rm, tgts, n);
1582*10105Sadam.leventhal@sun.com 			if (zio_checksum_error(zio) == 0) {
1583*10105Sadam.leventhal@sun.com 				atomic_inc_64(&raidz_corrected[code]);
1584*10105Sadam.leventhal@sun.com 
1585*10105Sadam.leventhal@sun.com 				for (i = 0; i < n; i++) {
1586*10105Sadam.leventhal@sun.com 					c = tgts[i];
1587*10105Sadam.leventhal@sun.com 					rc = &rm->rm_col[c];
1588*10105Sadam.leventhal@sun.com 					ASSERT(rc->rc_error == 0);
1589*10105Sadam.leventhal@sun.com 					if (rc->rc_tried)
1590*10105Sadam.leventhal@sun.com 						raidz_checksum_error(zio, rc);
1591*10105Sadam.leventhal@sun.com 					rc->rc_error = ECKSUM;
1592*10105Sadam.leventhal@sun.com 				}
1593*10105Sadam.leventhal@sun.com 
1594*10105Sadam.leventhal@sun.com 				ret = code;
1595*10105Sadam.leventhal@sun.com 				goto done;
1596*10105Sadam.leventhal@sun.com 			}
1597*10105Sadam.leventhal@sun.com 
1598*10105Sadam.leventhal@sun.com 			/*
1599*10105Sadam.leventhal@sun.com 			 * Restore the original data.
1600*10105Sadam.leventhal@sun.com 			 */
1601*10105Sadam.leventhal@sun.com 			for (i = 0; i < n; i++) {
1602*10105Sadam.leventhal@sun.com 				c = tgts[i];
1603*10105Sadam.leventhal@sun.com 				rc = &rm->rm_col[c];
1604*10105Sadam.leventhal@sun.com 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1605*10105Sadam.leventhal@sun.com 			}
1606*10105Sadam.leventhal@sun.com 
1607*10105Sadam.leventhal@sun.com 			do {
1608*10105Sadam.leventhal@sun.com 				/*
1609*10105Sadam.leventhal@sun.com 				 * Find the next valid column after the current
1610*10105Sadam.leventhal@sun.com 				 * position..
1611*10105Sadam.leventhal@sun.com 				 */
1612*10105Sadam.leventhal@sun.com 				for (next = tgts[current] + 1;
1613*10105Sadam.leventhal@sun.com 				    next < rm->rm_cols &&
1614*10105Sadam.leventhal@sun.com 				    rm->rm_col[next].rc_error != 0; next++)
1615*10105Sadam.leventhal@sun.com 					continue;
1616*10105Sadam.leventhal@sun.com 
1617*10105Sadam.leventhal@sun.com 				ASSERT(next <= tgts[current + 1]);
1618*10105Sadam.leventhal@sun.com 
1619*10105Sadam.leventhal@sun.com 				/*
1620*10105Sadam.leventhal@sun.com 				 * If that spot is available, we're done here.
1621*10105Sadam.leventhal@sun.com 				 */
1622*10105Sadam.leventhal@sun.com 				if (next != tgts[current + 1])
1623*10105Sadam.leventhal@sun.com 					break;
1624*10105Sadam.leventhal@sun.com 
1625*10105Sadam.leventhal@sun.com 				/*
1626*10105Sadam.leventhal@sun.com 				 * Otherwise, find the next valid column after
1627*10105Sadam.leventhal@sun.com 				 * the previous position.
1628*10105Sadam.leventhal@sun.com 				 */
1629*10105Sadam.leventhal@sun.com 				for (c = tgts[current - 1] + 1;
1630*10105Sadam.leventhal@sun.com 				    rm->rm_col[c].rc_error != 0; c++)
1631*10105Sadam.leventhal@sun.com 					continue;
1632*10105Sadam.leventhal@sun.com 
1633*10105Sadam.leventhal@sun.com 				tgts[current] = c;
1634*10105Sadam.leventhal@sun.com 				current++;
1635*10105Sadam.leventhal@sun.com 
1636*10105Sadam.leventhal@sun.com 			} while (current != n);
1637*10105Sadam.leventhal@sun.com 		}
1638*10105Sadam.leventhal@sun.com 	}
1639*10105Sadam.leventhal@sun.com 	n--;
1640*10105Sadam.leventhal@sun.com done:
1641*10105Sadam.leventhal@sun.com 	for (i = 0; i < n; i++) {
1642*10105Sadam.leventhal@sun.com 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1643*10105Sadam.leventhal@sun.com 	}
1644*10105Sadam.leventhal@sun.com 
1645*10105Sadam.leventhal@sun.com 	return (ret);
1646*10105Sadam.leventhal@sun.com }
1647*10105Sadam.leventhal@sun.com 
16487754SJeff.Bonwick@Sun.COM static void
1649789Sahrens vdev_raidz_io_done(zio_t *zio)
1650789Sahrens {
1651789Sahrens 	vdev_t *vd = zio->io_vd;
1652789Sahrens 	vdev_t *cvd;
1653789Sahrens 	raidz_map_t *rm = zio->io_vsd;
1654*10105Sadam.leventhal@sun.com 	raidz_col_t *rc;
1655789Sahrens 	int unexpected_errors = 0;
16562082Seschrock 	int parity_errors = 0;
16573456Sahl 	int parity_untried = 0;
16582082Seschrock 	int data_errors = 0;
16597754SJeff.Bonwick@Sun.COM 	int total_errors = 0;
1660*10105Sadam.leventhal@sun.com 	int n, c;
1661*10105Sadam.leventhal@sun.com 	int tgts[VDEV_RAIDZ_MAXPARITY];
1662*10105Sadam.leventhal@sun.com 	int code;
1663789Sahrens 
16641775Sbillm 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
1665789Sahrens 
16662082Seschrock 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
16672082Seschrock 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
16682082Seschrock 
1669789Sahrens 	for (c = 0; c < rm->rm_cols; c++) {
1670789Sahrens 		rc = &rm->rm_col[c];
1671789Sahrens 
1672789Sahrens 		if (rc->rc_error) {
16737754SJeff.Bonwick@Sun.COM 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
16742082Seschrock 
16752082Seschrock 			if (c < rm->rm_firstdatacol)
16762082Seschrock 				parity_errors++;
16772082Seschrock 			else
16782082Seschrock 				data_errors++;
16792082Seschrock 
1680789Sahrens 			if (!rc->rc_skipped)
1681789Sahrens 				unexpected_errors++;
16822082Seschrock 
16837754SJeff.Bonwick@Sun.COM 			total_errors++;
16843456Sahl 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
16853456Sahl 			parity_untried++;
1686789Sahrens 		}
1687789Sahrens 	}
1688789Sahrens 
1689789Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
1690789Sahrens 		/*
16917754SJeff.Bonwick@Sun.COM 		 * XXX -- for now, treat partial writes as a success.
16927754SJeff.Bonwick@Sun.COM 		 * (If we couldn't write enough columns to reconstruct
16937754SJeff.Bonwick@Sun.COM 		 * the data, the I/O failed.  Otherwise, good enough.)
16947754SJeff.Bonwick@Sun.COM 		 *
16957754SJeff.Bonwick@Sun.COM 		 * Now that we support write reallocation, it would be better
16967754SJeff.Bonwick@Sun.COM 		 * to treat partial failure as real failure unless there are
16977754SJeff.Bonwick@Sun.COM 		 * no non-degraded top-level vdevs left, and not update DTLs
16987754SJeff.Bonwick@Sun.COM 		 * if we intend to reallocate.
1699789Sahrens 		 */
1700789Sahrens 		/* XXPOLICY */
17017754SJeff.Bonwick@Sun.COM 		if (total_errors > rm->rm_firstdatacol)
17027754SJeff.Bonwick@Sun.COM 			zio->io_error = vdev_raidz_worst_error(rm);
1703789Sahrens 
17047754SJeff.Bonwick@Sun.COM 		return;
1705789Sahrens 	}
1706789Sahrens 
1707789Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
17082082Seschrock 	/*
17092082Seschrock 	 * There are three potential phases for a read:
17102082Seschrock 	 *	1. produce valid data from the columns read
17112082Seschrock 	 *	2. read all disks and try again
17122082Seschrock 	 *	3. perform combinatorial reconstruction
17132082Seschrock 	 *
17142082Seschrock 	 * Each phase is progressively both more expensive and less likely to
17152082Seschrock 	 * occur. If we encounter more errors than we can repair or all phases
17162082Seschrock 	 * fail, we have no choice but to return an error.
17172082Seschrock 	 */
1718789Sahrens 
1719789Sahrens 	/*
17202082Seschrock 	 * If the number of errors we saw was correctable -- less than or equal
17213456Sahl 	 * to the number of parity disks read -- attempt to produce data that
17223456Sahl 	 * has a valid checksum. Naturally, this case applies in the absence of
17233456Sahl 	 * any errors.
1724789Sahrens 	 */
17257754SJeff.Bonwick@Sun.COM 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1726*10105Sadam.leventhal@sun.com 		if (data_errors == 0) {
17272082Seschrock 			if (zio_checksum_error(zio) == 0) {
17284034Sahl 				/*
17294034Sahl 				 * If we read parity information (unnecessarily
17304034Sahl 				 * as it happens since no reconstruction was
17314034Sahl 				 * needed) regenerate and verify the parity.
17324034Sahl 				 * We also regenerate parity when resilvering
17334034Sahl 				 * so we can write it out to the failed device
17344034Sahl 				 * later.
17354034Sahl 				 */
17363456Sahl 				if (parity_errors + parity_untried <
17374034Sahl 				    rm->rm_firstdatacol ||
17384034Sahl 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
17393456Sahl 					n = raidz_parity_verify(zio, rm);
17403456Sahl 					unexpected_errors += n;
17413456Sahl 					ASSERT(parity_errors + n <=
17423456Sahl 					    rm->rm_firstdatacol);
17433456Sahl 				}
17442082Seschrock 				goto done;
17452082Seschrock 			}
1746*10105Sadam.leventhal@sun.com 		} else {
17473456Sahl 			/*
17483456Sahl 			 * We either attempt to read all the parity columns or
17493456Sahl 			 * none of them. If we didn't try to read parity, we
17503456Sahl 			 * wouldn't be here in the correctable case. There must
17513456Sahl 			 * also have been fewer parity errors than parity
17523456Sahl 			 * columns or, again, we wouldn't be in this code path.
17533456Sahl 			 */
17543456Sahl 			ASSERT(parity_untried == 0);
17552082Seschrock 			ASSERT(parity_errors < rm->rm_firstdatacol);
17562082Seschrock 
17572082Seschrock 			/*
1758*10105Sadam.leventhal@sun.com 			 * Identify the data columns that reported an error.
17592082Seschrock 			 */
1760*10105Sadam.leventhal@sun.com 			n = 0;
17612082Seschrock 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
17622082Seschrock 				rc = &rm->rm_col[c];
1763*10105Sadam.leventhal@sun.com 				if (rc->rc_error != 0) {
1764*10105Sadam.leventhal@sun.com 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1765*10105Sadam.leventhal@sun.com 					tgts[n++] = c;
1766*10105Sadam.leventhal@sun.com 				}
17672082Seschrock 			}
17682082Seschrock 
1769*10105Sadam.leventhal@sun.com 			ASSERT(rm->rm_firstdatacol >= n);
1770*10105Sadam.leventhal@sun.com 
1771*10105Sadam.leventhal@sun.com 			code = vdev_raidz_reconstruct(rm, tgts, n);
17722082Seschrock 
17732082Seschrock 			if (zio_checksum_error(zio) == 0) {
1774*10105Sadam.leventhal@sun.com 				atomic_inc_64(&raidz_corrected[code]);
1775789Sahrens 
17762082Seschrock 				/*
1777*10105Sadam.leventhal@sun.com 				 * If we read more parity disks than were used
1778*10105Sadam.leventhal@sun.com 				 * for reconstruction, confirm that the other
1779*10105Sadam.leventhal@sun.com 				 * parity disks produced correct data. This
1780*10105Sadam.leventhal@sun.com 				 * routine is suboptimal in that it regenerates
1781*10105Sadam.leventhal@sun.com 				 * the parity that we already used in addition
1782*10105Sadam.leventhal@sun.com 				 * to the parity that we're attempting to
1783*10105Sadam.leventhal@sun.com 				 * verify, but this should be a relatively
1784*10105Sadam.leventhal@sun.com 				 * uncommon case, and can be optimized if it
1785*10105Sadam.leventhal@sun.com 				 * becomes a problem. Note that we regenerate
1786*10105Sadam.leventhal@sun.com 				 * parity when resilvering so we can write it
1787*10105Sadam.leventhal@sun.com 				 * out to failed devices later.
17882082Seschrock 				 */
1789*10105Sadam.leventhal@sun.com 				if (parity_errors < rm->rm_firstdatacol - n ||
17904034Sahl 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
17912082Seschrock 					n = raidz_parity_verify(zio, rm);
17922082Seschrock 					unexpected_errors += n;
17932082Seschrock 					ASSERT(parity_errors + n <=
17942082Seschrock 					    rm->rm_firstdatacol);
17952082Seschrock 				}
17962082Seschrock 
17972082Seschrock 				goto done;
17982082Seschrock 			}
1799789Sahrens 		}
1800789Sahrens 	}
1801789Sahrens 
1802789Sahrens 	/*
18032082Seschrock 	 * This isn't a typical situation -- either we got a read error or
18042082Seschrock 	 * a child silently returned bad data. Read every block so we can
18052082Seschrock 	 * try again with as much data and parity as we can track down. If
18062082Seschrock 	 * we've already been through once before, all children will be marked
18072082Seschrock 	 * as tried so we'll proceed to combinatorial reconstruction.
1808789Sahrens 	 */
1809789Sahrens 	unexpected_errors = 1;
18102082Seschrock 	rm->rm_missingdata = 0;
18112082Seschrock 	rm->rm_missingparity = 0;
1812789Sahrens 
18132082Seschrock 	for (c = 0; c < rm->rm_cols; c++) {
18142082Seschrock 		if (rm->rm_col[c].rc_tried)
18152082Seschrock 			continue;
1816789Sahrens 
1817789Sahrens 		zio_vdev_io_redone(zio);
18182082Seschrock 		do {
1819789Sahrens 			rc = &rm->rm_col[c];
1820789Sahrens 			if (rc->rc_tried)
1821789Sahrens 				continue;
1822789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL,
18232082Seschrock 			    vd->vdev_child[rc->rc_devidx],
1824789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
18257754SJeff.Bonwick@Sun.COM 			    zio->io_type, zio->io_priority, 0,
1826789Sahrens 			    vdev_raidz_child_done, rc));
18272082Seschrock 		} while (++c < rm->rm_cols);
18285530Sbonwick 
18297754SJeff.Bonwick@Sun.COM 		return;
1830789Sahrens 	}
1831789Sahrens 
1832789Sahrens 	/*
18332082Seschrock 	 * At this point we've attempted to reconstruct the data given the
18342082Seschrock 	 * errors we detected, and we've attempted to read all columns. There
18352082Seschrock 	 * must, therefore, be one or more additional problems -- silent errors
18362082Seschrock 	 * resulting in invalid data rather than explicit I/O errors resulting
1837*10105Sadam.leventhal@sun.com 	 * in absent data. We check if there is enough additional data to
1838*10105Sadam.leventhal@sun.com 	 * possibly reconstruct the data and then perform combinatorial
1839*10105Sadam.leventhal@sun.com 	 * reconstruction over all possible combinations. If that fails,
1840*10105Sadam.leventhal@sun.com 	 * we're cooked.
1841789Sahrens 	 */
18427754SJeff.Bonwick@Sun.COM 	if (total_errors >= rm->rm_firstdatacol) {
18437754SJeff.Bonwick@Sun.COM 		zio->io_error = vdev_raidz_worst_error(rm);
18447754SJeff.Bonwick@Sun.COM 		/*
18457754SJeff.Bonwick@Sun.COM 		 * If there were exactly as many device errors as parity
18467754SJeff.Bonwick@Sun.COM 		 * columns, yet we couldn't reconstruct the data, then at
18477754SJeff.Bonwick@Sun.COM 		 * least one device must have returned bad data silently.
18487754SJeff.Bonwick@Sun.COM 		 */
18497754SJeff.Bonwick@Sun.COM 		if (total_errors == rm->rm_firstdatacol)
18507754SJeff.Bonwick@Sun.COM 			zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
18512082Seschrock 
1852*10105Sadam.leventhal@sun.com 	} else if ((code = vdev_raidz_combrec(zio, total_errors,
1853*10105Sadam.leventhal@sun.com 	    data_errors)) != 0) {
18542082Seschrock 		/*
1855*10105Sadam.leventhal@sun.com 		 * If we didn't use all the available parity for the
1856*10105Sadam.leventhal@sun.com 		 * combinatorial reconstruction, verify that the remaining
1857*10105Sadam.leventhal@sun.com 		 * parity is correct.
18582082Seschrock 		 */
1859*10105Sadam.leventhal@sun.com 		if (code != (1 << rm->rm_firstdatacol) - 1)
1860*10105Sadam.leventhal@sun.com 			(void) raidz_parity_verify(zio, rm);
1861*10105Sadam.leventhal@sun.com 	} else {
1862*10105Sadam.leventhal@sun.com 		/*
1863*10105Sadam.leventhal@sun.com 		 * All combinations failed to checksum. Generate checksum
1864*10105Sadam.leventhal@sun.com 		 * ereports for all children.
1865*10105Sadam.leventhal@sun.com 		 */
1866*10105Sadam.leventhal@sun.com 		zio->io_error = ECKSUM;
18672082Seschrock 
1868*10105Sadam.leventhal@sun.com 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1869*10105Sadam.leventhal@sun.com 			for (c = 0; c < rm->rm_cols; c++) {
1870*10105Sadam.leventhal@sun.com 				rc = &rm->rm_col[c];
1871*10105Sadam.leventhal@sun.com 				zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1872*10105Sadam.leventhal@sun.com 				    zio->io_spa, vd->vdev_child[rc->rc_devidx],
1873*10105Sadam.leventhal@sun.com 				    zio, rc->rc_offset, rc->rc_size);
18742082Seschrock 			}
18751544Seschrock 		}
18761544Seschrock 	}
1877789Sahrens 
1878789Sahrens done:
1879789Sahrens 	zio_checksum_verified(zio);
1880789Sahrens 
18818241SJeff.Bonwick@Sun.COM 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
1882789Sahrens 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1883789Sahrens 		/*
1884789Sahrens 		 * Use the good data we have in hand to repair damaged children.
1885789Sahrens 		 */
1886789Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
1887789Sahrens 			rc = &rm->rm_col[c];
18882082Seschrock 			cvd = vd->vdev_child[rc->rc_devidx];
1889789Sahrens 
18901732Sbonwick 			if (rc->rc_error == 0)
18911732Sbonwick 				continue;
18921732Sbonwick 
18937754SJeff.Bonwick@Sun.COM 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
18941732Sbonwick 			    rc->rc_offset, rc->rc_data, rc->rc_size,
18951732Sbonwick 			    ZIO_TYPE_WRITE, zio->io_priority,
18968241SJeff.Bonwick@Sun.COM 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
18978241SJeff.Bonwick@Sun.COM 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
18981732Sbonwick 		}
1899789Sahrens 	}
1900789Sahrens }
1901789Sahrens 
1902789Sahrens static void
1903789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1904789Sahrens {
19052082Seschrock 	if (faulted > vd->vdev_nparity)
19061544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
19071544Seschrock 		    VDEV_AUX_NO_REPLICAS);
1908789Sahrens 	else if (degraded + faulted != 0)
19091544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1910789Sahrens 	else
19111544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1912789Sahrens }
1913789Sahrens 
1914789Sahrens vdev_ops_t vdev_raidz_ops = {
1915789Sahrens 	vdev_raidz_open,
1916789Sahrens 	vdev_raidz_close,
1917789Sahrens 	vdev_raidz_asize,
1918789Sahrens 	vdev_raidz_io_start,
1919789Sahrens 	vdev_raidz_io_done,
1920789Sahrens 	vdev_raidz_state_change,
1921789Sahrens 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
1922789Sahrens 	B_FALSE			/* not a leaf vdev */
1923789Sahrens };
1924