/* xref: /onnv-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 12296:7cf402a7f374) */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

26789Sahrens #include <sys/zfs_context.h>
27789Sahrens #include <sys/spa.h>
28789Sahrens #include <sys/vdev_impl.h>
29789Sahrens #include <sys/zio.h>
30789Sahrens #include <sys/zio_checksum.h>
31789Sahrens #include <sys/fs/zfs.h>
321544Seschrock #include <sys/fm/fs/zfs.h>
33789Sahrens 
/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

/*
 * Per-column state for one RAID-Z I/O: which child device the column lives
 * on, where on that device, how much data it carries, and the outcome of
 * the I/O issued for it.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

111789Sahrens typedef struct raidz_map {
11210105Sadam.leventhal@sun.com 	uint64_t rm_cols;		/* Regular column count */
11310105Sadam.leventhal@sun.com 	uint64_t rm_scols;		/* Count including skipped columns */
1142082Seschrock 	uint64_t rm_bigcols;		/* Number of oversized columns */
1152082Seschrock 	uint64_t rm_asize;		/* Actual total I/O size */
1162082Seschrock 	uint64_t rm_missingdata;	/* Count of missing data devices */
1172082Seschrock 	uint64_t rm_missingparity;	/* Count of missing parity devices */
1182082Seschrock 	uint64_t rm_firstdatacol;	/* First data column/parity count */
11910450Sadam.leventhal@sun.com 	uint64_t rm_nskip;		/* Skipped sectors for padding */
12010450Sadam.leventhal@sun.com 	uint64_t rm_skipstart;	/* Column index of padding start */
12110614SJonathan.Adams@Sun.COM 	void *rm_datacopy;		/* rm_asize-buffer of copied data */
12210614SJonathan.Adams@Sun.COM 	uintptr_t rm_reports;		/* # of referencing checksum reports */
12310614SJonathan.Adams@Sun.COM 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
12410614SJonathan.Adams@Sun.COM 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
1252082Seschrock 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
126789Sahrens } raidz_map_t;
127789Sahrens 
/* Indices of the P, Q and R parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Byte-wise field multiplication by 2 and by 4. The result is NOT truncated
 * to 8 bits (bits above bit 7 may be set); only the low byte is meaningful,
 * so store the result in a uint8_t or mask it. The argument is evaluated
 * more than once, so it must be free of side effects.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * Step by step:
 *   1. mask keeps only bit 7 of each of the eight bytes of x;
 *   2. (mask << 1) - (mask >> 7) turns every byte whose top bit was set
 *      into 0xff and leaves every other byte 0x00;
 *   3. x is shifted left by one with the bit that crossed into the
 *      neighbouring byte cleared (0xfe...), then 0x1d is XORed into
 *      exactly the bytes selected by mask.
 *
 * Both macros modify x and mask in place; x and mask must be 64-bit unsigned
 * lvalues without side effects. They expand to a brace-enclosed compound
 * statement, so use them as a standalone statement rather than as the
 * unbraced body of an if that has an else.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), (mask)); \
	VDEV_RAIDZ_64MUL_2((x), (mask)); \
}

/*
 * Force reconstruction to use the general purpose method.
 * Zero (the default for this file-scope variable) leaves the choice of
 * reconstruction method to the code that consults it.
 */
int vdev_raidz_default_to_general;

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 *
 * vdev_raidz_pow2[i] is 2^i in the field. Entry 255 repeats entry 0 (0x01),
 * which lets callers index with a sum that has been reduced into [0, 255].
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * vdev_raidz_log2[a] is the base-2 logarithm of a in the Galois field, i.e.
 * the i in [0, 254] for which 2^i == a. Zero has no logarithm; entry 0 is a
 * placeholder (0x00) and must not be relied upon.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

23310614SJonathan.Adams@Sun.COM static void vdev_raidz_generate_parity(raidz_map_t *rm);
23410614SJonathan.Adams@Sun.COM 
2352082Seschrock /*
2362082Seschrock  * Multiply a given number by 2 raised to the given power.
2372082Seschrock  */
2382082Seschrock static uint8_t
vdev_raidz_exp2(uint_t a,int exp)2392082Seschrock vdev_raidz_exp2(uint_t a, int exp)
2402082Seschrock {
2412082Seschrock 	if (a == 0)
2422082Seschrock 		return (0);
2432082Seschrock 
2442082Seschrock 	ASSERT(exp >= 0);
2452082Seschrock 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
2462082Seschrock 
2472082Seschrock 	exp += vdev_raidz_log2[a];
2482082Seschrock 	if (exp > 255)
2492082Seschrock 		exp -= 255;
2502082Seschrock 
2512082Seschrock 	return (vdev_raidz_pow2[exp]);
2522082Seschrock }
2532082Seschrock 
2547754SJeff.Bonwick@Sun.COM static void
vdev_raidz_map_free(raidz_map_t * rm)25510614SJonathan.Adams@Sun.COM vdev_raidz_map_free(raidz_map_t *rm)
2567754SJeff.Bonwick@Sun.COM {
2577754SJeff.Bonwick@Sun.COM 	int c;
25810653SJonathan.Adams@Sun.COM 	size_t size;
2597754SJeff.Bonwick@Sun.COM 
26010614SJonathan.Adams@Sun.COM 	for (c = 0; c < rm->rm_firstdatacol; c++) {
2617754SJeff.Bonwick@Sun.COM 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
2627754SJeff.Bonwick@Sun.COM 
26310614SJonathan.Adams@Sun.COM 		if (rm->rm_col[c].rc_gdata != NULL)
26410614SJonathan.Adams@Sun.COM 			zio_buf_free(rm->rm_col[c].rc_gdata,
26510614SJonathan.Adams@Sun.COM 			    rm->rm_col[c].rc_size);
26610614SJonathan.Adams@Sun.COM 	}
26710614SJonathan.Adams@Sun.COM 
26810653SJonathan.Adams@Sun.COM 	size = 0;
26910653SJonathan.Adams@Sun.COM 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
27010653SJonathan.Adams@Sun.COM 		size += rm->rm_col[c].rc_size;
27110653SJonathan.Adams@Sun.COM 
27210614SJonathan.Adams@Sun.COM 	if (rm->rm_datacopy != NULL)
27310614SJonathan.Adams@Sun.COM 		zio_buf_free(rm->rm_datacopy, size);
27410614SJonathan.Adams@Sun.COM 
27510105Sadam.leventhal@sun.com 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
2767754SJeff.Bonwick@Sun.COM }
2777754SJeff.Bonwick@Sun.COM 
27810614SJonathan.Adams@Sun.COM static void
vdev_raidz_map_free_vsd(zio_t * zio)27910614SJonathan.Adams@Sun.COM vdev_raidz_map_free_vsd(zio_t *zio)
28010614SJonathan.Adams@Sun.COM {
28110614SJonathan.Adams@Sun.COM 	raidz_map_t *rm = zio->io_vsd;
28210614SJonathan.Adams@Sun.COM 
28310614SJonathan.Adams@Sun.COM 	ASSERT3U(rm->rm_freed, ==, 0);
28410614SJonathan.Adams@Sun.COM 	rm->rm_freed = 1;
28510614SJonathan.Adams@Sun.COM 
28610614SJonathan.Adams@Sun.COM 	if (rm->rm_reports == 0)
28710614SJonathan.Adams@Sun.COM 		vdev_raidz_map_free(rm);
28810614SJonathan.Adams@Sun.COM }
28910614SJonathan.Adams@Sun.COM 
29010614SJonathan.Adams@Sun.COM /*ARGSUSED*/
29110614SJonathan.Adams@Sun.COM static void
vdev_raidz_cksum_free(void * arg,size_t ignored)29210614SJonathan.Adams@Sun.COM vdev_raidz_cksum_free(void *arg, size_t ignored)
29310614SJonathan.Adams@Sun.COM {
29410614SJonathan.Adams@Sun.COM 	raidz_map_t *rm = arg;
29510614SJonathan.Adams@Sun.COM 
29610614SJonathan.Adams@Sun.COM 	ASSERT3U(rm->rm_reports, >, 0);
29710614SJonathan.Adams@Sun.COM 
29810653SJonathan.Adams@Sun.COM 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
29910614SJonathan.Adams@Sun.COM 		vdev_raidz_map_free(rm);
30010614SJonathan.Adams@Sun.COM }
30110614SJonathan.Adams@Sun.COM 
30210614SJonathan.Adams@Sun.COM static void
vdev_raidz_cksum_finish(zio_cksum_report_t * zcr,const void * good_data)30310614SJonathan.Adams@Sun.COM vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
30410614SJonathan.Adams@Sun.COM {
30510614SJonathan.Adams@Sun.COM 	raidz_map_t *rm = zcr->zcr_cbdata;
30610614SJonathan.Adams@Sun.COM 	size_t c = zcr->zcr_cbinfo;
30710614SJonathan.Adams@Sun.COM 	size_t x;
30810614SJonathan.Adams@Sun.COM 
30910614SJonathan.Adams@Sun.COM 	const char *good = NULL;
31010614SJonathan.Adams@Sun.COM 	const char *bad = rm->rm_col[c].rc_data;
31110614SJonathan.Adams@Sun.COM 
31210614SJonathan.Adams@Sun.COM 	if (good_data == NULL) {
31310614SJonathan.Adams@Sun.COM 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
31410614SJonathan.Adams@Sun.COM 		return;
31510614SJonathan.Adams@Sun.COM 	}
31610614SJonathan.Adams@Sun.COM 
31710614SJonathan.Adams@Sun.COM 	if (c < rm->rm_firstdatacol) {
31810614SJonathan.Adams@Sun.COM 		/*
31910614SJonathan.Adams@Sun.COM 		 * The first time through, calculate the parity blocks for
32010614SJonathan.Adams@Sun.COM 		 * the good data (this relies on the fact that the good
32110614SJonathan.Adams@Sun.COM 		 * data never changes for a given logical ZIO)
32210614SJonathan.Adams@Sun.COM 		 */
32310614SJonathan.Adams@Sun.COM 		if (rm->rm_col[0].rc_gdata == NULL) {
32410614SJonathan.Adams@Sun.COM 			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
32510614SJonathan.Adams@Sun.COM 			char *buf;
32610614SJonathan.Adams@Sun.COM 
32710614SJonathan.Adams@Sun.COM 			/*
32810614SJonathan.Adams@Sun.COM 			 * Set up the rm_col[]s to generate the parity for
32910614SJonathan.Adams@Sun.COM 			 * good_data, first saving the parity bufs and
33010614SJonathan.Adams@Sun.COM 			 * replacing them with buffers to hold the result.
33110614SJonathan.Adams@Sun.COM 			 */
33210614SJonathan.Adams@Sun.COM 			for (x = 0; x < rm->rm_firstdatacol; x++) {
33310614SJonathan.Adams@Sun.COM 				bad_parity[x] = rm->rm_col[x].rc_data;
33410614SJonathan.Adams@Sun.COM 				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
33510614SJonathan.Adams@Sun.COM 				    zio_buf_alloc(rm->rm_col[x].rc_size);
33610614SJonathan.Adams@Sun.COM 			}
33710614SJonathan.Adams@Sun.COM 
33810614SJonathan.Adams@Sun.COM 			/* fill in the data columns from good_data */
33910614SJonathan.Adams@Sun.COM 			buf = (char *)good_data;
34010614SJonathan.Adams@Sun.COM 			for (; x < rm->rm_cols; x++) {
34110614SJonathan.Adams@Sun.COM 				rm->rm_col[x].rc_data = buf;
34210614SJonathan.Adams@Sun.COM 				buf += rm->rm_col[x].rc_size;
34310614SJonathan.Adams@Sun.COM 			}
34410614SJonathan.Adams@Sun.COM 
34510614SJonathan.Adams@Sun.COM 			/*
34610614SJonathan.Adams@Sun.COM 			 * Construct the parity from the good data.
34710614SJonathan.Adams@Sun.COM 			 */
34810614SJonathan.Adams@Sun.COM 			vdev_raidz_generate_parity(rm);
34910614SJonathan.Adams@Sun.COM 
35010614SJonathan.Adams@Sun.COM 			/* restore everything back to its original state */
35110614SJonathan.Adams@Sun.COM 			for (x = 0; x < rm->rm_firstdatacol; x++)
35210614SJonathan.Adams@Sun.COM 				rm->rm_col[x].rc_data = bad_parity[x];
35310614SJonathan.Adams@Sun.COM 
35410614SJonathan.Adams@Sun.COM 			buf = rm->rm_datacopy;
35510614SJonathan.Adams@Sun.COM 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
35610614SJonathan.Adams@Sun.COM 				rm->rm_col[x].rc_data = buf;
35710614SJonathan.Adams@Sun.COM 				buf += rm->rm_col[x].rc_size;
35810614SJonathan.Adams@Sun.COM 			}
35910614SJonathan.Adams@Sun.COM 		}
36010614SJonathan.Adams@Sun.COM 
36110614SJonathan.Adams@Sun.COM 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
36210614SJonathan.Adams@Sun.COM 		good = rm->rm_col[c].rc_gdata;
36310614SJonathan.Adams@Sun.COM 	} else {
36410614SJonathan.Adams@Sun.COM 		/* adjust good_data to point at the start of our column */
36510614SJonathan.Adams@Sun.COM 		good = good_data;
36610614SJonathan.Adams@Sun.COM 
36710614SJonathan.Adams@Sun.COM 		for (x = rm->rm_firstdatacol; x < c; x++)
36810614SJonathan.Adams@Sun.COM 			good += rm->rm_col[x].rc_size;
36910614SJonathan.Adams@Sun.COM 	}
37010614SJonathan.Adams@Sun.COM 
37110614SJonathan.Adams@Sun.COM 	/* we drop the ereport if it ends up that the data was good */
37210614SJonathan.Adams@Sun.COM 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
37310614SJonathan.Adams@Sun.COM }
37410614SJonathan.Adams@Sun.COM 
37510614SJonathan.Adams@Sun.COM /*
37610614SJonathan.Adams@Sun.COM  * Invoked indirectly by zfs_ereport_start_checksum(), called
37710614SJonathan.Adams@Sun.COM  * below when our read operation fails completely.  The main point
37810614SJonathan.Adams@Sun.COM  * is to keep a copy of everything we read from disk, so that at
37910614SJonathan.Adams@Sun.COM  * vdev_raidz_cksum_finish() time we can compare it with the good data.
38010614SJonathan.Adams@Sun.COM  */
38110614SJonathan.Adams@Sun.COM static void
vdev_raidz_cksum_report(zio_t * zio,zio_cksum_report_t * zcr,void * arg)38210614SJonathan.Adams@Sun.COM vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
38310614SJonathan.Adams@Sun.COM {
38410614SJonathan.Adams@Sun.COM 	size_t c = (size_t)(uintptr_t)arg;
38510614SJonathan.Adams@Sun.COM 	caddr_t buf;
38610614SJonathan.Adams@Sun.COM 
38710614SJonathan.Adams@Sun.COM 	raidz_map_t *rm = zio->io_vsd;
38810614SJonathan.Adams@Sun.COM 	size_t size;
38910614SJonathan.Adams@Sun.COM 
39010614SJonathan.Adams@Sun.COM 	/* set up the report and bump the refcount  */
39110614SJonathan.Adams@Sun.COM 	zcr->zcr_cbdata = rm;
39210614SJonathan.Adams@Sun.COM 	zcr->zcr_cbinfo = c;
39310614SJonathan.Adams@Sun.COM 	zcr->zcr_finish = vdev_raidz_cksum_finish;
39410614SJonathan.Adams@Sun.COM 	zcr->zcr_free = vdev_raidz_cksum_free;
39510614SJonathan.Adams@Sun.COM 
39610614SJonathan.Adams@Sun.COM 	rm->rm_reports++;
39710614SJonathan.Adams@Sun.COM 	ASSERT3U(rm->rm_reports, >, 0);
39810614SJonathan.Adams@Sun.COM 
39910653SJonathan.Adams@Sun.COM 	if (rm->rm_datacopy != NULL)
40010614SJonathan.Adams@Sun.COM 		return;
40110614SJonathan.Adams@Sun.COM 
40210614SJonathan.Adams@Sun.COM 	/*
40310653SJonathan.Adams@Sun.COM 	 * It's the first time we're called for this raidz_map_t, so we need
40410653SJonathan.Adams@Sun.COM 	 * to copy the data aside; there's no guarantee that our zio's buffer
40510653SJonathan.Adams@Sun.COM 	 * won't be re-used for something else.
40610614SJonathan.Adams@Sun.COM 	 *
40710653SJonathan.Adams@Sun.COM 	 * Our parity data is already in separate buffers, so there's no need
40810614SJonathan.Adams@Sun.COM 	 * to copy them.
40910614SJonathan.Adams@Sun.COM 	 */
41010614SJonathan.Adams@Sun.COM 
41110653SJonathan.Adams@Sun.COM 	size = 0;
41210653SJonathan.Adams@Sun.COM 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
41310653SJonathan.Adams@Sun.COM 		size += rm->rm_col[c].rc_size;
41410614SJonathan.Adams@Sun.COM 
41510614SJonathan.Adams@Sun.COM 	buf = rm->rm_datacopy = zio_buf_alloc(size);
41610653SJonathan.Adams@Sun.COM 
41710653SJonathan.Adams@Sun.COM 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
41810614SJonathan.Adams@Sun.COM 		raidz_col_t *col = &rm->rm_col[c];
41910614SJonathan.Adams@Sun.COM 
42010614SJonathan.Adams@Sun.COM 		bcopy(col->rc_data, buf, col->rc_size);
42110614SJonathan.Adams@Sun.COM 		col->rc_data = buf;
42210614SJonathan.Adams@Sun.COM 
42310614SJonathan.Adams@Sun.COM 		buf += col->rc_size;
42410614SJonathan.Adams@Sun.COM 	}
42510614SJonathan.Adams@Sun.COM 	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
42610614SJonathan.Adams@Sun.COM }
42710614SJonathan.Adams@Sun.COM 
42810614SJonathan.Adams@Sun.COM static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
42910614SJonathan.Adams@Sun.COM 	vdev_raidz_map_free_vsd,
43010614SJonathan.Adams@Sun.COM 	vdev_raidz_cksum_report
43110614SJonathan.Adams@Sun.COM };
43210614SJonathan.Adams@Sun.COM 
433789Sahrens static raidz_map_t *
vdev_raidz_map_alloc(zio_t * zio,uint64_t unit_shift,uint64_t dcols,uint64_t nparity)4342082Seschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
4352082Seschrock     uint64_t nparity)
436789Sahrens {
437789Sahrens 	raidz_map_t *rm;
438789Sahrens 	uint64_t b = zio->io_offset >> unit_shift;
439789Sahrens 	uint64_t s = zio->io_size >> unit_shift;
440789Sahrens 	uint64_t f = b % dcols;
441789Sahrens 	uint64_t o = (b / dcols) << unit_shift;
44210105Sadam.leventhal@sun.com 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
443789Sahrens 
4442082Seschrock 	q = s / (dcols - nparity);
4452082Seschrock 	r = s - q * (dcols - nparity);
4462082Seschrock 	bc = (r == 0 ? 0 : r + nparity);
44710105Sadam.leventhal@sun.com 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
448789Sahrens 
44910105Sadam.leventhal@sun.com 	if (q == 0) {
45010105Sadam.leventhal@sun.com 		acols = bc;
45110105Sadam.leventhal@sun.com 		scols = MIN(dcols, roundup(bc, nparity + 1));
45210105Sadam.leventhal@sun.com 	} else {
45310105Sadam.leventhal@sun.com 		acols = dcols;
45410105Sadam.leventhal@sun.com 		scols = dcols;
45510105Sadam.leventhal@sun.com 	}
456789Sahrens 
45710105Sadam.leventhal@sun.com 	ASSERT3U(acols, <=, scols);
45810105Sadam.leventhal@sun.com 
45910105Sadam.leventhal@sun.com 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
460789Sahrens 
461789Sahrens 	rm->rm_cols = acols;
46210105Sadam.leventhal@sun.com 	rm->rm_scols = scols;
463789Sahrens 	rm->rm_bigcols = bc;
46410450Sadam.leventhal@sun.com 	rm->rm_skipstart = bc;
4652082Seschrock 	rm->rm_missingdata = 0;
4662082Seschrock 	rm->rm_missingparity = 0;
4672082Seschrock 	rm->rm_firstdatacol = nparity;
46810614SJonathan.Adams@Sun.COM 	rm->rm_datacopy = NULL;
46910614SJonathan.Adams@Sun.COM 	rm->rm_reports = 0;
47010614SJonathan.Adams@Sun.COM 	rm->rm_freed = 0;
47110614SJonathan.Adams@Sun.COM 	rm->rm_ecksuminjected = 0;
472789Sahrens 
47310105Sadam.leventhal@sun.com 	asize = 0;
47410105Sadam.leventhal@sun.com 
47510105Sadam.leventhal@sun.com 	for (c = 0; c < scols; c++) {
476789Sahrens 		col = f + c;
477789Sahrens 		coff = o;
478789Sahrens 		if (col >= dcols) {
479789Sahrens 			col -= dcols;
480789Sahrens 			coff += 1ULL << unit_shift;
481789Sahrens 		}
4822082Seschrock 		rm->rm_col[c].rc_devidx = col;
483789Sahrens 		rm->rm_col[c].rc_offset = coff;
484789Sahrens 		rm->rm_col[c].rc_data = NULL;
48510614SJonathan.Adams@Sun.COM 		rm->rm_col[c].rc_gdata = NULL;
486789Sahrens 		rm->rm_col[c].rc_error = 0;
487789Sahrens 		rm->rm_col[c].rc_tried = 0;
488789Sahrens 		rm->rm_col[c].rc_skipped = 0;
48910105Sadam.leventhal@sun.com 
49010105Sadam.leventhal@sun.com 		if (c >= acols)
49110105Sadam.leventhal@sun.com 			rm->rm_col[c].rc_size = 0;
49210105Sadam.leventhal@sun.com 		else if (c < bc)
49310105Sadam.leventhal@sun.com 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
49410105Sadam.leventhal@sun.com 		else
49510105Sadam.leventhal@sun.com 			rm->rm_col[c].rc_size = q << unit_shift;
49610105Sadam.leventhal@sun.com 
49710105Sadam.leventhal@sun.com 		asize += rm->rm_col[c].rc_size;
498789Sahrens 	}
499789Sahrens 
50010105Sadam.leventhal@sun.com 	ASSERT3U(asize, ==, tot << unit_shift);
50110105Sadam.leventhal@sun.com 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
50210450Sadam.leventhal@sun.com 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
50310450Sadam.leventhal@sun.com 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
50410450Sadam.leventhal@sun.com 	ASSERT3U(rm->rm_nskip, <=, nparity);
505789Sahrens 
506789Sahrens 	for (c = 0; c < rm->rm_firstdatacol; c++)
507789Sahrens 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
508789Sahrens 
509789Sahrens 	rm->rm_col[c].rc_data = zio->io_data;
510789Sahrens 
511789Sahrens 	for (c = c + 1; c < acols; c++)
512789Sahrens 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
513789Sahrens 		    rm->rm_col[c - 1].rc_size;
514789Sahrens 
5151133Seschrock 	/*
5162082Seschrock 	 * If all data stored spans all columns, there's a danger that parity
5172082Seschrock 	 * will always be on the same device and, since parity isn't read
5182082Seschrock 	 * during normal operation, that that device's I/O bandwidth won't be
5192082Seschrock 	 * used effectively. We therefore switch the parity every 1MB.
5202082Seschrock 	 *
5212082Seschrock 	 * ... at least that was, ostensibly, the theory. As a practical
5222082Seschrock 	 * matter unless we juggle the parity between all devices evenly, we
5232082Seschrock 	 * won't see any benefit. Further, occasional writes that aren't a
5242082Seschrock 	 * multiple of the LCM of the number of children and the minimum
5252082Seschrock 	 * stripe width are sufficient to avoid pessimal behavior.
5262082Seschrock 	 * Unfortunately, this decision created an implicit on-disk format
5273456Sahl 	 * requirement that we need to support for all eternity, but only
5283456Sahl 	 * for single-parity RAID-Z.
52910450Sadam.leventhal@sun.com 	 *
53010450Sadam.leventhal@sun.com 	 * If we intend to skip a sector in the zeroth column for padding
53110450Sadam.leventhal@sun.com 	 * we must make sure to note this swap. We will never intend to
53210450Sadam.leventhal@sun.com 	 * skip the first column since at least one data and one parity
53310450Sadam.leventhal@sun.com 	 * column must appear in each row.
5341133Seschrock 	 */
5351133Seschrock 	ASSERT(rm->rm_cols >= 2);
5361133Seschrock 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
537789Sahrens 
5382082Seschrock 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
5392082Seschrock 		devidx = rm->rm_col[0].rc_devidx;
5401133Seschrock 		o = rm->rm_col[0].rc_offset;
5412082Seschrock 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
5421133Seschrock 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
5432082Seschrock 		rm->rm_col[1].rc_devidx = devidx;
5441133Seschrock 		rm->rm_col[1].rc_offset = o;
54510450Sadam.leventhal@sun.com 
54610450Sadam.leventhal@sun.com 		if (rm->rm_skipstart == 0)
54710450Sadam.leventhal@sun.com 			rm->rm_skipstart = 1;
548789Sahrens 	}
549789Sahrens 
550789Sahrens 	zio->io_vsd = rm;
55110614SJonathan.Adams@Sun.COM 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
552789Sahrens 	return (rm);
553789Sahrens }
554789Sahrens 
555789Sahrens static void
vdev_raidz_generate_parity_p(raidz_map_t * rm)5562082Seschrock vdev_raidz_generate_parity_p(raidz_map_t *rm)
5572082Seschrock {
5582082Seschrock 	uint64_t *p, *src, pcount, ccount, i;
5592082Seschrock 	int c;
5602082Seschrock 
5612082Seschrock 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
5622082Seschrock 
5632082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
5642082Seschrock 		src = rm->rm_col[c].rc_data;
5652082Seschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
5662082Seschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
5672082Seschrock 
5682082Seschrock 		if (c == rm->rm_firstdatacol) {
5692082Seschrock 			ASSERT(ccount == pcount);
57010105Sadam.leventhal@sun.com 			for (i = 0; i < ccount; i++, src++, p++) {
5712082Seschrock 				*p = *src;
5722082Seschrock 			}
5732082Seschrock 		} else {
5742082Seschrock 			ASSERT(ccount <= pcount);
57510105Sadam.leventhal@sun.com 			for (i = 0; i < ccount; i++, src++, p++) {
5762082Seschrock 				*p ^= *src;
5772082Seschrock 			}
5782082Seschrock 		}
5792082Seschrock 	}
5802082Seschrock }
5812082Seschrock 
5822082Seschrock static void
vdev_raidz_generate_parity_pq(raidz_map_t * rm)5832082Seschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm)
584789Sahrens {
58510105Sadam.leventhal@sun.com 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
5862082Seschrock 	int c;
5872082Seschrock 
58810105Sadam.leventhal@sun.com 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
5892082Seschrock 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
5902082Seschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
5912082Seschrock 
5922082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
5932082Seschrock 		src = rm->rm_col[c].rc_data;
5942082Seschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
5952082Seschrock 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
59610105Sadam.leventhal@sun.com 
59710105Sadam.leventhal@sun.com 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
5982082Seschrock 
5992082Seschrock 		if (c == rm->rm_firstdatacol) {
60010105Sadam.leventhal@sun.com 			ASSERT(ccnt == pcnt || ccnt == 0);
60110105Sadam.leventhal@sun.com 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
6022082Seschrock 				*p = *src;
60310105Sadam.leventhal@sun.com 				*q = *src;
6042082Seschrock 			}
60510105Sadam.leventhal@sun.com 			for (; i < pcnt; i++, src++, p++, q++) {
60610105Sadam.leventhal@sun.com 				*p = 0;
6072082Seschrock 				*q = 0;
6082082Seschrock 			}
6092082Seschrock 		} else {
61010105Sadam.leventhal@sun.com 			ASSERT(ccnt <= pcnt);
611789Sahrens 
6122082Seschrock 			/*
61310105Sadam.leventhal@sun.com 			 * Apply the algorithm described above by multiplying
61410105Sadam.leventhal@sun.com 			 * the previous result and adding in the new value.
6152082Seschrock 			 */
61610105Sadam.leventhal@sun.com 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
61710105Sadam.leventhal@sun.com 				*p ^= *src;
61810105Sadam.leventhal@sun.com 
61910105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*q, mask);
6202082Seschrock 				*q ^= *src;
6212082Seschrock 			}
6222082Seschrock 
6232082Seschrock 			/*
6242082Seschrock 			 * Treat short columns as though they are full of 0s.
62510105Sadam.leventhal@sun.com 			 * Note that there's therefore nothing needed for P.
6262082Seschrock 			 */
62710105Sadam.leventhal@sun.com 			for (; i < pcnt; i++, q++) {
62810105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*q, mask);
6292082Seschrock 			}
6302082Seschrock 		}
6312082Seschrock 	}
6322082Seschrock }
6332082Seschrock 
6342082Seschrock static void
vdev_raidz_generate_parity_pqr(raidz_map_t * rm)63510105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
63610105Sadam.leventhal@sun.com {
63710105Sadam.leventhal@sun.com 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
63810105Sadam.leventhal@sun.com 	int c;
63910105Sadam.leventhal@sun.com 
64010105Sadam.leventhal@sun.com 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
64110105Sadam.leventhal@sun.com 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
64210105Sadam.leventhal@sun.com 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
64310105Sadam.leventhal@sun.com 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
64410105Sadam.leventhal@sun.com 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
64510105Sadam.leventhal@sun.com 
64610105Sadam.leventhal@sun.com 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
64710105Sadam.leventhal@sun.com 		src = rm->rm_col[c].rc_data;
64810105Sadam.leventhal@sun.com 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
64910105Sadam.leventhal@sun.com 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
65010105Sadam.leventhal@sun.com 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
65110105Sadam.leventhal@sun.com 
65210105Sadam.leventhal@sun.com 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
65310105Sadam.leventhal@sun.com 
65410105Sadam.leventhal@sun.com 		if (c == rm->rm_firstdatacol) {
65510105Sadam.leventhal@sun.com 			ASSERT(ccnt == pcnt || ccnt == 0);
65610105Sadam.leventhal@sun.com 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
65710105Sadam.leventhal@sun.com 				*p = *src;
65810105Sadam.leventhal@sun.com 				*q = *src;
65910105Sadam.leventhal@sun.com 				*r = *src;
66010105Sadam.leventhal@sun.com 			}
66110105Sadam.leventhal@sun.com 			for (; i < pcnt; i++, src++, p++, q++, r++) {
66210105Sadam.leventhal@sun.com 				*p = 0;
66310105Sadam.leventhal@sun.com 				*q = 0;
66410105Sadam.leventhal@sun.com 				*r = 0;
66510105Sadam.leventhal@sun.com 			}
66610105Sadam.leventhal@sun.com 		} else {
66710105Sadam.leventhal@sun.com 			ASSERT(ccnt <= pcnt);
66810105Sadam.leventhal@sun.com 
66910105Sadam.leventhal@sun.com 			/*
67010105Sadam.leventhal@sun.com 			 * Apply the algorithm described above by multiplying
67110105Sadam.leventhal@sun.com 			 * the previous result and adding in the new value.
67210105Sadam.leventhal@sun.com 			 */
67310105Sadam.leventhal@sun.com 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
67410105Sadam.leventhal@sun.com 				*p ^= *src;
67510105Sadam.leventhal@sun.com 
67610105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*q, mask);
67710105Sadam.leventhal@sun.com 				*q ^= *src;
67810105Sadam.leventhal@sun.com 
67910105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_4(*r, mask);
68010105Sadam.leventhal@sun.com 				*r ^= *src;
68110105Sadam.leventhal@sun.com 			}
68210105Sadam.leventhal@sun.com 
68310105Sadam.leventhal@sun.com 			/*
68410105Sadam.leventhal@sun.com 			 * Treat short columns as though they are full of 0s.
68510105Sadam.leventhal@sun.com 			 * Note that there's therefore nothing needed for P.
68610105Sadam.leventhal@sun.com 			 */
68710105Sadam.leventhal@sun.com 			for (; i < pcnt; i++, q++, r++) {
68810105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*q, mask);
68910105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_4(*r, mask);
69010105Sadam.leventhal@sun.com 			}
69110105Sadam.leventhal@sun.com 		}
69210105Sadam.leventhal@sun.com 	}
69310105Sadam.leventhal@sun.com }
69410105Sadam.leventhal@sun.com 
69510105Sadam.leventhal@sun.com /*
69610105Sadam.leventhal@sun.com  * Generate RAID parity in the first virtual columns according to the number of
69710105Sadam.leventhal@sun.com  * parity columns available.
69810105Sadam.leventhal@sun.com  */
69910105Sadam.leventhal@sun.com static void
vdev_raidz_generate_parity(raidz_map_t * rm)70010105Sadam.leventhal@sun.com vdev_raidz_generate_parity(raidz_map_t *rm)
70110105Sadam.leventhal@sun.com {
70210105Sadam.leventhal@sun.com 	switch (rm->rm_firstdatacol) {
70310105Sadam.leventhal@sun.com 	case 1:
70410105Sadam.leventhal@sun.com 		vdev_raidz_generate_parity_p(rm);
70510105Sadam.leventhal@sun.com 		break;
70610105Sadam.leventhal@sun.com 	case 2:
70710105Sadam.leventhal@sun.com 		vdev_raidz_generate_parity_pq(rm);
70810105Sadam.leventhal@sun.com 		break;
70910105Sadam.leventhal@sun.com 	case 3:
71010105Sadam.leventhal@sun.com 		vdev_raidz_generate_parity_pqr(rm);
71110105Sadam.leventhal@sun.com 		break;
71210105Sadam.leventhal@sun.com 	default:
71310105Sadam.leventhal@sun.com 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
71410105Sadam.leventhal@sun.com 	}
71510105Sadam.leventhal@sun.com }
71610105Sadam.leventhal@sun.com 
71710105Sadam.leventhal@sun.com static int
vdev_raidz_reconstruct_p(raidz_map_t * rm,int * tgts,int ntgts)71810105Sadam.leventhal@sun.com vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
7192082Seschrock {
7202082Seschrock 	uint64_t *dst, *src, xcount, ccount, count, i;
72110105Sadam.leventhal@sun.com 	int x = tgts[0];
7222082Seschrock 	int c;
7232082Seschrock 
72410105Sadam.leventhal@sun.com 	ASSERT(ntgts == 1);
72510105Sadam.leventhal@sun.com 	ASSERT(x >= rm->rm_firstdatacol);
72610105Sadam.leventhal@sun.com 	ASSERT(x < rm->rm_cols);
72710105Sadam.leventhal@sun.com 
7282082Seschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
7292082Seschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
7302082Seschrock 	ASSERT(xcount > 0);
7312082Seschrock 
7322082Seschrock 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
7332082Seschrock 	dst = rm->rm_col[x].rc_data;
7342082Seschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
7352082Seschrock 		*dst = *src;
7362082Seschrock 	}
7372082Seschrock 
7382082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
739789Sahrens 		src = rm->rm_col[c].rc_data;
740789Sahrens 		dst = rm->rm_col[x].rc_data;
7412082Seschrock 
7422082Seschrock 		if (c == x)
7432082Seschrock 			continue;
7442082Seschrock 
7452082Seschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
7462082Seschrock 		count = MIN(ccount, xcount);
7472082Seschrock 
7482082Seschrock 		for (i = 0; i < count; i++, dst++, src++) {
7492082Seschrock 			*dst ^= *src;
750789Sahrens 		}
751789Sahrens 	}
75210105Sadam.leventhal@sun.com 
75310105Sadam.leventhal@sun.com 	return (1 << VDEV_RAIDZ_P);
754789Sahrens }
755789Sahrens 
75610105Sadam.leventhal@sun.com static int
vdev_raidz_reconstruct_q(raidz_map_t * rm,int * tgts,int ntgts)75710105Sadam.leventhal@sun.com vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
7582082Seschrock {
7592082Seschrock 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
7602082Seschrock 	uint8_t *b;
76110105Sadam.leventhal@sun.com 	int x = tgts[0];
7622082Seschrock 	int c, j, exp;
7632082Seschrock 
76410105Sadam.leventhal@sun.com 	ASSERT(ntgts == 1);
76510105Sadam.leventhal@sun.com 
7662082Seschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
7672082Seschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
7682082Seschrock 
7692082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
7702082Seschrock 		src = rm->rm_col[c].rc_data;
7712082Seschrock 		dst = rm->rm_col[x].rc_data;
7722082Seschrock 
7732082Seschrock 		if (c == x)
7742082Seschrock 			ccount = 0;
7752082Seschrock 		else
7762082Seschrock 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
7772082Seschrock 
7782082Seschrock 		count = MIN(ccount, xcount);
7792082Seschrock 
7802082Seschrock 		if (c == rm->rm_firstdatacol) {
7812082Seschrock 			for (i = 0; i < count; i++, dst++, src++) {
7822082Seschrock 				*dst = *src;
7832082Seschrock 			}
7842082Seschrock 			for (; i < xcount; i++, dst++) {
7852082Seschrock 				*dst = 0;
7862082Seschrock 			}
7872082Seschrock 
7882082Seschrock 		} else {
7892082Seschrock 			for (i = 0; i < count; i++, dst++, src++) {
79010105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*dst, mask);
7912082Seschrock 				*dst ^= *src;
7922082Seschrock 			}
7932082Seschrock 
7942082Seschrock 			for (; i < xcount; i++, dst++) {
79510105Sadam.leventhal@sun.com 				VDEV_RAIDZ_64MUL_2(*dst, mask);
7962082Seschrock 			}
7972082Seschrock 		}
7982082Seschrock 	}
7992082Seschrock 
8002082Seschrock 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
8012082Seschrock 	dst = rm->rm_col[x].rc_data;
8022082Seschrock 	exp = 255 - (rm->rm_cols - 1 - x);
8032082Seschrock 
8042082Seschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
8052082Seschrock 		*dst ^= *src;
8062082Seschrock 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
8072082Seschrock 			*b = vdev_raidz_exp2(*b, exp);
8082082Seschrock 		}
8092082Seschrock 	}
81010105Sadam.leventhal@sun.com 
81110105Sadam.leventhal@sun.com 	return (1 << VDEV_RAIDZ_Q);
8122082Seschrock }
8132082Seschrock 
81410105Sadam.leventhal@sun.com static int
vdev_raidz_reconstruct_pq(raidz_map_t * rm,int * tgts,int ntgts)81510105Sadam.leventhal@sun.com vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
8162082Seschrock {
8172082Seschrock 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
8182082Seschrock 	void *pdata, *qdata;
8192082Seschrock 	uint64_t xsize, ysize, i;
82010105Sadam.leventhal@sun.com 	int x = tgts[0];
82110105Sadam.leventhal@sun.com 	int y = tgts[1];
8222082Seschrock 
82310105Sadam.leventhal@sun.com 	ASSERT(ntgts == 2);
8242082Seschrock 	ASSERT(x < y);
8252082Seschrock 	ASSERT(x >= rm->rm_firstdatacol);
8262082Seschrock 	ASSERT(y < rm->rm_cols);
8272082Seschrock 
8282082Seschrock 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
8292082Seschrock 
8302082Seschrock 	/*
8312082Seschrock 	 * Move the parity data aside -- we're going to compute parity as
8322082Seschrock 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
8332082Seschrock 	 * reuse the parity generation mechanism without trashing the actual
8342082Seschrock 	 * parity so we make those columns appear to be full of zeros by
8352082Seschrock 	 * setting their lengths to zero.
8362082Seschrock 	 */
8372082Seschrock 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
8382082Seschrock 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
8392082Seschrock 	xsize = rm->rm_col[x].rc_size;
8402082Seschrock 	ysize = rm->rm_col[y].rc_size;
8412082Seschrock 
8422082Seschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
8432082Seschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
8442082Seschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
8452082Seschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
8462082Seschrock 	rm->rm_col[x].rc_size = 0;
8472082Seschrock 	rm->rm_col[y].rc_size = 0;
8482082Seschrock 
8492082Seschrock 	vdev_raidz_generate_parity_pq(rm);
8502082Seschrock 
8512082Seschrock 	rm->rm_col[x].rc_size = xsize;
8522082Seschrock 	rm->rm_col[y].rc_size = ysize;
8532082Seschrock 
8542082Seschrock 	p = pdata;
8552082Seschrock 	q = qdata;
8562082Seschrock 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
8572082Seschrock 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
8582082Seschrock 	xd = rm->rm_col[x].rc_data;
8592082Seschrock 	yd = rm->rm_col[y].rc_data;
8602082Seschrock 
8612082Seschrock 	/*
8622082Seschrock 	 * We now have:
8632082Seschrock 	 *	Pxy = P + D_x + D_y
8642082Seschrock 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
8652082Seschrock 	 *
8662082Seschrock 	 * We can then solve for D_x:
8672082Seschrock 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
8682082Seschrock 	 * where
8692082Seschrock 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
8702082Seschrock 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
8712082Seschrock 	 *
8722082Seschrock 	 * With D_x in hand, we can easily solve for D_y:
8732082Seschrock 	 *	D_y = P + Pxy + D_x
8742082Seschrock 	 */
8752082Seschrock 
8762082Seschrock 	a = vdev_raidz_pow2[255 + x - y];
8772082Seschrock 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
8782082Seschrock 	tmp = 255 - vdev_raidz_log2[a ^ 1];
8792082Seschrock 
8802082Seschrock 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
8812082Seschrock 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
8822082Seschrock 
8832082Seschrock 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
8842082Seschrock 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
8852082Seschrock 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
8862082Seschrock 
8872082Seschrock 		if (i < ysize)
8882082Seschrock 			*yd = *p ^ *pxy ^ *xd;
8892082Seschrock 	}
8902082Seschrock 
8912082Seschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
8922082Seschrock 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
8932082Seschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
8942082Seschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
8952082Seschrock 
8962082Seschrock 	/*
8972082Seschrock 	 * Restore the saved parity data.
8982082Seschrock 	 */
8992082Seschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
9002082Seschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
90110105Sadam.leventhal@sun.com 
90210105Sadam.leventhal@sun.com 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
90310105Sadam.leventhal@sun.com }
90410105Sadam.leventhal@sun.com 
90510105Sadam.leventhal@sun.com /* BEGIN CSTYLED */
90610105Sadam.leventhal@sun.com /*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
91210105Sadam.leventhal@sun.com  *
91310105Sadam.leventhal@sun.com  *            __   __                     __     __
91410105Sadam.leventhal@sun.com  *            |     |         __     __   |  p_0  |
91510105Sadam.leventhal@sun.com  *            |  V  |         |  D_0  |   | p_m-1 |
91610105Sadam.leventhal@sun.com  *            |     |    x    |   :   | = |  d_0  |
91710105Sadam.leventhal@sun.com  *            |  I  |         | D_n-1 |   |   :   |
91810105Sadam.leventhal@sun.com  *            |     |         ~~     ~~   | d_n-1 |
91910105Sadam.leventhal@sun.com  *            ~~   ~~                     ~~     ~~
92010105Sadam.leventhal@sun.com  *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity and speedy
 * computation as well as linear separability.
92510105Sadam.leventhal@sun.com  *
92610105Sadam.leventhal@sun.com  *      __               __               __     __
92710105Sadam.leventhal@sun.com  *      |   1   ..  1 1 1 |               |  p_0  |
92810105Sadam.leventhal@sun.com  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
92910105Sadam.leventhal@sun.com  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
93010105Sadam.leventhal@sun.com  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
93110105Sadam.leventhal@sun.com  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
93210105Sadam.leventhal@sun.com  *      |   :       : : : |   |   :   |   |  d_2  |
93310105Sadam.leventhal@sun.com  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
93410105Sadam.leventhal@sun.com  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
93510105Sadam.leventhal@sun.com  *      |   0   ..  0 0 1 |               | d_n-1 |
93610105Sadam.leventhal@sun.com  *      ~~               ~~               ~~     ~~
93710105Sadam.leventhal@sun.com  *
93810105Sadam.leventhal@sun.com  * Note that I, V, d, and p are known. To compute D, we must invert the
93910105Sadam.leventhal@sun.com  * matrix and use the known data and parity values to reconstruct the unknown
94010105Sadam.leventhal@sun.com  * data values. We begin by removing the rows in V|I and d|p that correspond
94110105Sadam.leventhal@sun.com  * to failed or missing columns; we then make V|I square (n x n) and d|p
94210105Sadam.leventhal@sun.com  * sized n by removing rows corresponding to unused parity from the bottom up
94310105Sadam.leventhal@sun.com  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
94410105Sadam.leventhal@sun.com  * using Gauss-Jordan elimination. In the example below we use m=3 parity
94510105Sadam.leventhal@sun.com  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
94610105Sadam.leventhal@sun.com  *           __                               __
94710105Sadam.leventhal@sun.com  *           |  1   1   1   1   1   1   1   1  |
94810105Sadam.leventhal@sun.com  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
94910105Sadam.leventhal@sun.com  *           |  19 205 116  29  64  16  4   1  |      / /
95010105Sadam.leventhal@sun.com  *           |  1   0   0   0   0   0   0   0  |     / /
95110105Sadam.leventhal@sun.com  *           |  0   1   0   0   0   0   0   0  | <--' /
95210105Sadam.leventhal@sun.com  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
95310105Sadam.leventhal@sun.com  *           |  0   0   0   1   0   0   0   0  |
95410105Sadam.leventhal@sun.com  *           |  0   0   0   0   1   0   0   0  |
95510105Sadam.leventhal@sun.com  *           |  0   0   0   0   0   1   0   0  |
95610105Sadam.leventhal@sun.com  *           |  0   0   0   0   0   0   1   0  |
95710105Sadam.leventhal@sun.com  *           |  0   0   0   0   0   0   0   1  |
95810105Sadam.leventhal@sun.com  *           ~~                               ~~
95910105Sadam.leventhal@sun.com  *           __                               __
96010105Sadam.leventhal@sun.com  *           |  1   1   1   1   1   1   1   1  |
96110105Sadam.leventhal@sun.com  *           | 128  64  32  16  8   4   2   1  |
96210105Sadam.leventhal@sun.com  *           |  19 205 116  29  64  16  4   1  |
96310105Sadam.leventhal@sun.com  *           |  1   0   0   0   0   0   0   0  |
96410105Sadam.leventhal@sun.com  *           |  0   1   0   0   0   0   0   0  |
96510105Sadam.leventhal@sun.com  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
96610105Sadam.leventhal@sun.com  *           |  0   0   0   1   0   0   0   0  |
96710105Sadam.leventhal@sun.com  *           |  0   0   0   0   1   0   0   0  |
96810105Sadam.leventhal@sun.com  *           |  0   0   0   0   0   1   0   0  |
96910105Sadam.leventhal@sun.com  *           |  0   0   0   0   0   0   1   0  |
97010105Sadam.leventhal@sun.com  *           |  0   0   0   0   0   0   0   1  |
97110105Sadam.leventhal@sun.com  *           ~~                               ~~
97210105Sadam.leventhal@sun.com  *
97310105Sadam.leventhal@sun.com  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
97410105Sadam.leventhal@sun.com  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
97510105Sadam.leventhal@sun.com  * matrix is not singular.
97610105Sadam.leventhal@sun.com  * __                                                                 __
97710105Sadam.leventhal@sun.com  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
97810105Sadam.leventhal@sun.com  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
97910105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
98010105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
98110105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
98210105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
98310105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
98410105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
98510105Sadam.leventhal@sun.com  * ~~                                                                 ~~
98610105Sadam.leventhal@sun.com  * __                                                                 __
98710105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
98810105Sadam.leventhal@sun.com  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
98910105Sadam.leventhal@sun.com  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
99010105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
99110105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
99210105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
99310105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
99410105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
99510105Sadam.leventhal@sun.com  * ~~                                                                 ~~
99610105Sadam.leventhal@sun.com  * __                                                                 __
99710105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
99810105Sadam.leventhal@sun.com  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
99910105Sadam.leventhal@sun.com  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
100010105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
100110105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
100210105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
100310105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
100410105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
100510105Sadam.leventhal@sun.com  * ~~                                                                 ~~
100610105Sadam.leventhal@sun.com  * __                                                                 __
100710105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
100810105Sadam.leventhal@sun.com  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
100910105Sadam.leventhal@sun.com  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
101010105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
101110105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
101210105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
101310105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
101410105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
101510105Sadam.leventhal@sun.com  * ~~                                                                 ~~
101610105Sadam.leventhal@sun.com  * __                                                                 __
101710105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
101810105Sadam.leventhal@sun.com  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
101910105Sadam.leventhal@sun.com  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
102010105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
102110105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
102210105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
102310105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
102410105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
102510105Sadam.leventhal@sun.com  * ~~                                                                 ~~
102610105Sadam.leventhal@sun.com  * __                                                                 __
102710105Sadam.leventhal@sun.com  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
102810105Sadam.leventhal@sun.com  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
102910105Sadam.leventhal@sun.com  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
103010105Sadam.leventhal@sun.com  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
103110105Sadam.leventhal@sun.com  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
103210105Sadam.leventhal@sun.com  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
103310105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
103410105Sadam.leventhal@sun.com  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
103510105Sadam.leventhal@sun.com  * ~~                                                                 ~~
103610105Sadam.leventhal@sun.com  *                   __                               __
103710105Sadam.leventhal@sun.com  *                   |  0   0   1   0   0   0   0   0  |
103810105Sadam.leventhal@sun.com  *                   | 167 100  5   41 159 169 217 208 |
103910105Sadam.leventhal@sun.com  *                   | 166 100  4   40 158 168 216 209 |
104010105Sadam.leventhal@sun.com  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
104110105Sadam.leventhal@sun.com  *                   |  0   0   0   0   1   0   0   0  |
104210105Sadam.leventhal@sun.com  *                   |  0   0   0   0   0   1   0   0  |
104310105Sadam.leventhal@sun.com  *                   |  0   0   0   0   0   0   1   0  |
104410105Sadam.leventhal@sun.com  *                   |  0   0   0   0   0   0   0   1  |
104510105Sadam.leventhal@sun.com  *                   ~~                               ~~
104610105Sadam.leventhal@sun.com  *
104710105Sadam.leventhal@sun.com  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
104810105Sadam.leventhal@sun.com  * of the missing data.
104910105Sadam.leventhal@sun.com  *
105010105Sadam.leventhal@sun.com  * As is apparent from the example above, the only non-trivial rows in the
105110105Sadam.leventhal@sun.com  * inverse matrix correspond to the data disks that we're trying to
105210105Sadam.leventhal@sun.com  * reconstruct. Indeed, those are the only rows we need as the others would
105310105Sadam.leventhal@sun.com  * only be useful for reconstructing data known or assumed to be valid. For
105410105Sadam.leventhal@sun.com  * that reason, we only build the coefficients in the rows that correspond to
105510105Sadam.leventhal@sun.com  * targeted columns.
105610105Sadam.leventhal@sun.com  */
105710105Sadam.leventhal@sun.com /* END CSTYLED */
105810105Sadam.leventhal@sun.com 
105910105Sadam.leventhal@sun.com static void
vdev_raidz_matrix_init(raidz_map_t * rm,int n,int nmap,int * map,uint8_t ** rows)106010105Sadam.leventhal@sun.com vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
106110105Sadam.leventhal@sun.com     uint8_t **rows)
106210105Sadam.leventhal@sun.com {
106310105Sadam.leventhal@sun.com 	int i, j;
106410105Sadam.leventhal@sun.com 	int pow;
106510105Sadam.leventhal@sun.com 
106610105Sadam.leventhal@sun.com 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
106710105Sadam.leventhal@sun.com 
106810105Sadam.leventhal@sun.com 	/*
106910105Sadam.leventhal@sun.com 	 * Fill in the missing rows of interest.
107010105Sadam.leventhal@sun.com 	 */
107110105Sadam.leventhal@sun.com 	for (i = 0; i < nmap; i++) {
107210105Sadam.leventhal@sun.com 		ASSERT3S(0, <=, map[i]);
107310105Sadam.leventhal@sun.com 		ASSERT3S(map[i], <=, 2);
107410105Sadam.leventhal@sun.com 
107510105Sadam.leventhal@sun.com 		pow = map[i] * n;
107610105Sadam.leventhal@sun.com 		if (pow > 255)
107710105Sadam.leventhal@sun.com 			pow -= 255;
107810105Sadam.leventhal@sun.com 		ASSERT(pow <= 255);
107910105Sadam.leventhal@sun.com 
108010105Sadam.leventhal@sun.com 		for (j = 0; j < n; j++) {
108110105Sadam.leventhal@sun.com 			pow -= map[i];
108210105Sadam.leventhal@sun.com 			if (pow < 0)
108310105Sadam.leventhal@sun.com 				pow += 255;
108410105Sadam.leventhal@sun.com 			rows[i][j] = vdev_raidz_pow2[pow];
108510105Sadam.leventhal@sun.com 		}
108610105Sadam.leventhal@sun.com 	}
10872082Seschrock }
10882082Seschrock 
108910105Sadam.leventhal@sun.com static void
vdev_raidz_matrix_invert(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** rows,uint8_t ** invrows,const uint8_t * used)109010105Sadam.leventhal@sun.com vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
109110105Sadam.leventhal@sun.com     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
109210105Sadam.leventhal@sun.com {
109310105Sadam.leventhal@sun.com 	int i, j, ii, jj;
109410105Sadam.leventhal@sun.com 	uint8_t log;
109510105Sadam.leventhal@sun.com 
109610105Sadam.leventhal@sun.com 	/*
109710105Sadam.leventhal@sun.com 	 * Assert that the first nmissing entries from the array of used
109810105Sadam.leventhal@sun.com 	 * columns correspond to parity columns and that subsequent entries
109910105Sadam.leventhal@sun.com 	 * correspond to data columns.
110010105Sadam.leventhal@sun.com 	 */
110110105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
110210105Sadam.leventhal@sun.com 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
110310105Sadam.leventhal@sun.com 	}
110410105Sadam.leventhal@sun.com 	for (; i < n; i++) {
110510105Sadam.leventhal@sun.com 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
110610105Sadam.leventhal@sun.com 	}
110710105Sadam.leventhal@sun.com 
110810105Sadam.leventhal@sun.com 	/*
110910105Sadam.leventhal@sun.com 	 * First initialize the storage where we'll compute the inverse rows.
111010105Sadam.leventhal@sun.com 	 */
111110105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
111210105Sadam.leventhal@sun.com 		for (j = 0; j < n; j++) {
111310105Sadam.leventhal@sun.com 			invrows[i][j] = (i == j) ? 1 : 0;
111410105Sadam.leventhal@sun.com 		}
111510105Sadam.leventhal@sun.com 	}
111610105Sadam.leventhal@sun.com 
111710105Sadam.leventhal@sun.com 	/*
111810105Sadam.leventhal@sun.com 	 * Subtract all trivial rows from the rows of consequence.
111910105Sadam.leventhal@sun.com 	 */
112010105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
112110105Sadam.leventhal@sun.com 		for (j = nmissing; j < n; j++) {
112210105Sadam.leventhal@sun.com 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
112310105Sadam.leventhal@sun.com 			jj = used[j] - rm->rm_firstdatacol;
112410105Sadam.leventhal@sun.com 			ASSERT3S(jj, <, n);
112510105Sadam.leventhal@sun.com 			invrows[i][j] = rows[i][jj];
112610105Sadam.leventhal@sun.com 			rows[i][jj] = 0;
112710105Sadam.leventhal@sun.com 		}
112810105Sadam.leventhal@sun.com 	}
112910105Sadam.leventhal@sun.com 
113010105Sadam.leventhal@sun.com 	/*
113110105Sadam.leventhal@sun.com 	 * For each of the rows of interest, we must normalize it and subtract
113210105Sadam.leventhal@sun.com 	 * a multiple of it from the other rows.
113310105Sadam.leventhal@sun.com 	 */
113410105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
113510105Sadam.leventhal@sun.com 		for (j = 0; j < missing[i]; j++) {
113610105Sadam.leventhal@sun.com 			ASSERT3U(rows[i][j], ==, 0);
113710105Sadam.leventhal@sun.com 		}
113810105Sadam.leventhal@sun.com 		ASSERT3U(rows[i][missing[i]], !=, 0);
113910105Sadam.leventhal@sun.com 
114010105Sadam.leventhal@sun.com 		/*
114110105Sadam.leventhal@sun.com 		 * Compute the inverse of the first element and multiply each
114210105Sadam.leventhal@sun.com 		 * element in the row by that value.
114310105Sadam.leventhal@sun.com 		 */
114410105Sadam.leventhal@sun.com 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
114510105Sadam.leventhal@sun.com 
114610105Sadam.leventhal@sun.com 		for (j = 0; j < n; j++) {
114710105Sadam.leventhal@sun.com 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
114810105Sadam.leventhal@sun.com 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
114910105Sadam.leventhal@sun.com 		}
115010105Sadam.leventhal@sun.com 
115110105Sadam.leventhal@sun.com 		for (ii = 0; ii < nmissing; ii++) {
115210105Sadam.leventhal@sun.com 			if (i == ii)
115310105Sadam.leventhal@sun.com 				continue;
115410105Sadam.leventhal@sun.com 
115510105Sadam.leventhal@sun.com 			ASSERT3U(rows[ii][missing[i]], !=, 0);
115610105Sadam.leventhal@sun.com 
115710105Sadam.leventhal@sun.com 			log = vdev_raidz_log2[rows[ii][missing[i]]];
115810105Sadam.leventhal@sun.com 
115910105Sadam.leventhal@sun.com 			for (j = 0; j < n; j++) {
116010105Sadam.leventhal@sun.com 				rows[ii][j] ^=
116110105Sadam.leventhal@sun.com 				    vdev_raidz_exp2(rows[i][j], log);
116210105Sadam.leventhal@sun.com 				invrows[ii][j] ^=
116310105Sadam.leventhal@sun.com 				    vdev_raidz_exp2(invrows[i][j], log);
116410105Sadam.leventhal@sun.com 			}
116510105Sadam.leventhal@sun.com 		}
116610105Sadam.leventhal@sun.com 	}
116710105Sadam.leventhal@sun.com 
116810105Sadam.leventhal@sun.com 	/*
116910105Sadam.leventhal@sun.com 	 * Verify that the data that is left in the rows are properly part of
117010105Sadam.leventhal@sun.com 	 * an identity matrix.
117110105Sadam.leventhal@sun.com 	 */
117210105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
117310105Sadam.leventhal@sun.com 		for (j = 0; j < n; j++) {
117410105Sadam.leventhal@sun.com 			if (j == missing[i]) {
117510105Sadam.leventhal@sun.com 				ASSERT3U(rows[i][j], ==, 1);
117610105Sadam.leventhal@sun.com 			} else {
117710105Sadam.leventhal@sun.com 				ASSERT3U(rows[i][j], ==, 0);
117810105Sadam.leventhal@sun.com 			}
117910105Sadam.leventhal@sun.com 		}
118010105Sadam.leventhal@sun.com 	}
118110105Sadam.leventhal@sun.com }
118210105Sadam.leventhal@sun.com 
118310105Sadam.leventhal@sun.com static void
vdev_raidz_matrix_reconstruct(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** invrows,const uint8_t * used)118410105Sadam.leventhal@sun.com vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
118510105Sadam.leventhal@sun.com     int *missing, uint8_t **invrows, const uint8_t *used)
118610105Sadam.leventhal@sun.com {
118710105Sadam.leventhal@sun.com 	int i, j, x, cc, c;
118810105Sadam.leventhal@sun.com 	uint8_t *src;
118910105Sadam.leventhal@sun.com 	uint64_t ccount;
119010105Sadam.leventhal@sun.com 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
119110105Sadam.leventhal@sun.com 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
119210105Sadam.leventhal@sun.com 	uint8_t log, val;
119310105Sadam.leventhal@sun.com 	int ll;
119410105Sadam.leventhal@sun.com 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
119510105Sadam.leventhal@sun.com 	uint8_t *p, *pp;
119610105Sadam.leventhal@sun.com 	size_t psize;
119710105Sadam.leventhal@sun.com 
119810105Sadam.leventhal@sun.com 	psize = sizeof (invlog[0][0]) * n * nmissing;
119910105Sadam.leventhal@sun.com 	p = kmem_alloc(psize, KM_SLEEP);
120010105Sadam.leventhal@sun.com 
120110105Sadam.leventhal@sun.com 	for (pp = p, i = 0; i < nmissing; i++) {
120210105Sadam.leventhal@sun.com 		invlog[i] = pp;
120310105Sadam.leventhal@sun.com 		pp += n;
120410105Sadam.leventhal@sun.com 	}
120510105Sadam.leventhal@sun.com 
120610105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing; i++) {
120710105Sadam.leventhal@sun.com 		for (j = 0; j < n; j++) {
120810105Sadam.leventhal@sun.com 			ASSERT3U(invrows[i][j], !=, 0);
120910105Sadam.leventhal@sun.com 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
121010105Sadam.leventhal@sun.com 		}
121110105Sadam.leventhal@sun.com 	}
121210105Sadam.leventhal@sun.com 
121310105Sadam.leventhal@sun.com 	for (i = 0; i < n; i++) {
121410105Sadam.leventhal@sun.com 		c = used[i];
121510105Sadam.leventhal@sun.com 		ASSERT3U(c, <, rm->rm_cols);
121610105Sadam.leventhal@sun.com 
121710105Sadam.leventhal@sun.com 		src = rm->rm_col[c].rc_data;
121810105Sadam.leventhal@sun.com 		ccount = rm->rm_col[c].rc_size;
121910105Sadam.leventhal@sun.com 		for (j = 0; j < nmissing; j++) {
122010105Sadam.leventhal@sun.com 			cc = missing[j] + rm->rm_firstdatacol;
122110105Sadam.leventhal@sun.com 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
122210105Sadam.leventhal@sun.com 			ASSERT3U(cc, <, rm->rm_cols);
122310105Sadam.leventhal@sun.com 			ASSERT3U(cc, !=, c);
122410105Sadam.leventhal@sun.com 
122510105Sadam.leventhal@sun.com 			dst[j] = rm->rm_col[cc].rc_data;
122610105Sadam.leventhal@sun.com 			dcount[j] = rm->rm_col[cc].rc_size;
122710105Sadam.leventhal@sun.com 		}
122810105Sadam.leventhal@sun.com 
122910105Sadam.leventhal@sun.com 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
123010105Sadam.leventhal@sun.com 
123110105Sadam.leventhal@sun.com 		for (x = 0; x < ccount; x++, src++) {
123210105Sadam.leventhal@sun.com 			if (*src != 0)
123310105Sadam.leventhal@sun.com 				log = vdev_raidz_log2[*src];
123410105Sadam.leventhal@sun.com 
123510105Sadam.leventhal@sun.com 			for (cc = 0; cc < nmissing; cc++) {
123610105Sadam.leventhal@sun.com 				if (x >= dcount[cc])
123710105Sadam.leventhal@sun.com 					continue;
123810105Sadam.leventhal@sun.com 
123910105Sadam.leventhal@sun.com 				if (*src == 0) {
124010105Sadam.leventhal@sun.com 					val = 0;
124110105Sadam.leventhal@sun.com 				} else {
124210105Sadam.leventhal@sun.com 					if ((ll = log + invlog[cc][i]) >= 255)
124310105Sadam.leventhal@sun.com 						ll -= 255;
124410105Sadam.leventhal@sun.com 					val = vdev_raidz_pow2[ll];
124510105Sadam.leventhal@sun.com 				}
124610105Sadam.leventhal@sun.com 
124710105Sadam.leventhal@sun.com 				if (i == 0)
124810105Sadam.leventhal@sun.com 					dst[cc][x] = val;
124910105Sadam.leventhal@sun.com 				else
125010105Sadam.leventhal@sun.com 					dst[cc][x] ^= val;
125110105Sadam.leventhal@sun.com 			}
125210105Sadam.leventhal@sun.com 		}
125310105Sadam.leventhal@sun.com 	}
125410105Sadam.leventhal@sun.com 
125510105Sadam.leventhal@sun.com 	kmem_free(p, psize);
125610105Sadam.leventhal@sun.com }
125710105Sadam.leventhal@sun.com 
125810105Sadam.leventhal@sun.com static int
vdev_raidz_reconstruct_general(raidz_map_t * rm,int * tgts,int ntgts)125910105Sadam.leventhal@sun.com vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
126010105Sadam.leventhal@sun.com {
126110105Sadam.leventhal@sun.com 	int n, i, c, t, tt;
126210105Sadam.leventhal@sun.com 	int nmissing_rows;
126310105Sadam.leventhal@sun.com 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
126410105Sadam.leventhal@sun.com 	int parity_map[VDEV_RAIDZ_MAXPARITY];
126510105Sadam.leventhal@sun.com 
126610105Sadam.leventhal@sun.com 	uint8_t *p, *pp;
126710105Sadam.leventhal@sun.com 	size_t psize;
126810105Sadam.leventhal@sun.com 
126910105Sadam.leventhal@sun.com 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
127010105Sadam.leventhal@sun.com 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
127110105Sadam.leventhal@sun.com 	uint8_t *used;
127210105Sadam.leventhal@sun.com 
127310105Sadam.leventhal@sun.com 	int code = 0;
127410105Sadam.leventhal@sun.com 
127510105Sadam.leventhal@sun.com 
127610105Sadam.leventhal@sun.com 	n = rm->rm_cols - rm->rm_firstdatacol;
127710105Sadam.leventhal@sun.com 
127810105Sadam.leventhal@sun.com 	/*
127910105Sadam.leventhal@sun.com 	 * Figure out which data columns are missing.
128010105Sadam.leventhal@sun.com 	 */
128110105Sadam.leventhal@sun.com 	nmissing_rows = 0;
128210105Sadam.leventhal@sun.com 	for (t = 0; t < ntgts; t++) {
128310105Sadam.leventhal@sun.com 		if (tgts[t] >= rm->rm_firstdatacol) {
128410105Sadam.leventhal@sun.com 			missing_rows[nmissing_rows++] =
128510105Sadam.leventhal@sun.com 			    tgts[t] - rm->rm_firstdatacol;
128610105Sadam.leventhal@sun.com 		}
128710105Sadam.leventhal@sun.com 	}
128810105Sadam.leventhal@sun.com 
128910105Sadam.leventhal@sun.com 	/*
129010105Sadam.leventhal@sun.com 	 * Figure out which parity columns to use to help generate the missing
129110105Sadam.leventhal@sun.com 	 * data columns.
129210105Sadam.leventhal@sun.com 	 */
129310105Sadam.leventhal@sun.com 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
129410105Sadam.leventhal@sun.com 		ASSERT(tt < ntgts);
129510105Sadam.leventhal@sun.com 		ASSERT(c < rm->rm_firstdatacol);
129610105Sadam.leventhal@sun.com 
129710105Sadam.leventhal@sun.com 		/*
129810105Sadam.leventhal@sun.com 		 * Skip any targeted parity columns.
129910105Sadam.leventhal@sun.com 		 */
130010105Sadam.leventhal@sun.com 		if (c == tgts[tt]) {
130110105Sadam.leventhal@sun.com 			tt++;
130210105Sadam.leventhal@sun.com 			continue;
130310105Sadam.leventhal@sun.com 		}
130410105Sadam.leventhal@sun.com 
130510105Sadam.leventhal@sun.com 		code |= 1 << c;
130610105Sadam.leventhal@sun.com 
130710105Sadam.leventhal@sun.com 		parity_map[i] = c;
130810105Sadam.leventhal@sun.com 		i++;
130910105Sadam.leventhal@sun.com 	}
131010105Sadam.leventhal@sun.com 
131110105Sadam.leventhal@sun.com 	ASSERT(code != 0);
131210105Sadam.leventhal@sun.com 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
131310105Sadam.leventhal@sun.com 
131410105Sadam.leventhal@sun.com 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
131510105Sadam.leventhal@sun.com 	    nmissing_rows * n + sizeof (used[0]) * n;
131610105Sadam.leventhal@sun.com 	p = kmem_alloc(psize, KM_SLEEP);
131710105Sadam.leventhal@sun.com 
131810105Sadam.leventhal@sun.com 	for (pp = p, i = 0; i < nmissing_rows; i++) {
131910105Sadam.leventhal@sun.com 		rows[i] = pp;
132010105Sadam.leventhal@sun.com 		pp += n;
132110105Sadam.leventhal@sun.com 		invrows[i] = pp;
132210105Sadam.leventhal@sun.com 		pp += n;
132310105Sadam.leventhal@sun.com 	}
132410105Sadam.leventhal@sun.com 	used = pp;
132510105Sadam.leventhal@sun.com 
132610105Sadam.leventhal@sun.com 	for (i = 0; i < nmissing_rows; i++) {
132710105Sadam.leventhal@sun.com 		used[i] = parity_map[i];
132810105Sadam.leventhal@sun.com 	}
132910105Sadam.leventhal@sun.com 
133010105Sadam.leventhal@sun.com 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
133110105Sadam.leventhal@sun.com 		if (tt < nmissing_rows &&
133210105Sadam.leventhal@sun.com 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
133310105Sadam.leventhal@sun.com 			tt++;
133410105Sadam.leventhal@sun.com 			continue;
133510105Sadam.leventhal@sun.com 		}
133610105Sadam.leventhal@sun.com 
133710105Sadam.leventhal@sun.com 		ASSERT3S(i, <, n);
133810105Sadam.leventhal@sun.com 		used[i] = c;
133910105Sadam.leventhal@sun.com 		i++;
134010105Sadam.leventhal@sun.com 	}
134110105Sadam.leventhal@sun.com 
134210105Sadam.leventhal@sun.com 	/*
134310105Sadam.leventhal@sun.com 	 * Initialize the interesting rows of the matrix.
134410105Sadam.leventhal@sun.com 	 */
134510105Sadam.leventhal@sun.com 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
134610105Sadam.leventhal@sun.com 
134710105Sadam.leventhal@sun.com 	/*
134810105Sadam.leventhal@sun.com 	 * Invert the matrix.
134910105Sadam.leventhal@sun.com 	 */
135010105Sadam.leventhal@sun.com 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
135110105Sadam.leventhal@sun.com 	    invrows, used);
135210105Sadam.leventhal@sun.com 
135310105Sadam.leventhal@sun.com 	/*
135410105Sadam.leventhal@sun.com 	 * Reconstruct the missing data using the generated matrix.
135510105Sadam.leventhal@sun.com 	 */
135610105Sadam.leventhal@sun.com 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
135710105Sadam.leventhal@sun.com 	    invrows, used);
135810105Sadam.leventhal@sun.com 
135910105Sadam.leventhal@sun.com 	kmem_free(p, psize);
136010105Sadam.leventhal@sun.com 
136110105Sadam.leventhal@sun.com 	return (code);
136210105Sadam.leventhal@sun.com }
136310105Sadam.leventhal@sun.com 
136410105Sadam.leventhal@sun.com static int
vdev_raidz_reconstruct(raidz_map_t * rm,int * t,int nt)136510105Sadam.leventhal@sun.com vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
136610105Sadam.leventhal@sun.com {
136710105Sadam.leventhal@sun.com 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
136810105Sadam.leventhal@sun.com 	int ntgts;
136910105Sadam.leventhal@sun.com 	int i, c;
137010105Sadam.leventhal@sun.com 	int code;
137110105Sadam.leventhal@sun.com 	int nbadparity, nbaddata;
137210105Sadam.leventhal@sun.com 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
137310105Sadam.leventhal@sun.com 
137410105Sadam.leventhal@sun.com 	/*
137510105Sadam.leventhal@sun.com 	 * The tgts list must already be sorted.
137610105Sadam.leventhal@sun.com 	 */
137710105Sadam.leventhal@sun.com 	for (i = 1; i < nt; i++) {
137810105Sadam.leventhal@sun.com 		ASSERT(t[i] > t[i - 1]);
137910105Sadam.leventhal@sun.com 	}
138010105Sadam.leventhal@sun.com 
138110105Sadam.leventhal@sun.com 	nbadparity = rm->rm_firstdatacol;
138210105Sadam.leventhal@sun.com 	nbaddata = rm->rm_cols - nbadparity;
138310105Sadam.leventhal@sun.com 	ntgts = 0;
138410105Sadam.leventhal@sun.com 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
138510105Sadam.leventhal@sun.com 		if (c < rm->rm_firstdatacol)
138610105Sadam.leventhal@sun.com 			parity_valid[c] = B_FALSE;
138710105Sadam.leventhal@sun.com 
138810105Sadam.leventhal@sun.com 		if (i < nt && c == t[i]) {
138910105Sadam.leventhal@sun.com 			tgts[ntgts++] = c;
139010105Sadam.leventhal@sun.com 			i++;
139110105Sadam.leventhal@sun.com 		} else if (rm->rm_col[c].rc_error != 0) {
139210105Sadam.leventhal@sun.com 			tgts[ntgts++] = c;
139310105Sadam.leventhal@sun.com 		} else if (c >= rm->rm_firstdatacol) {
139410105Sadam.leventhal@sun.com 			nbaddata--;
139510105Sadam.leventhal@sun.com 		} else {
139610105Sadam.leventhal@sun.com 			parity_valid[c] = B_TRUE;
139710105Sadam.leventhal@sun.com 			nbadparity--;
139810105Sadam.leventhal@sun.com 		}
139910105Sadam.leventhal@sun.com 	}
140010105Sadam.leventhal@sun.com 
140110105Sadam.leventhal@sun.com 	ASSERT(ntgts >= nt);
140210105Sadam.leventhal@sun.com 	ASSERT(nbaddata >= 0);
140310105Sadam.leventhal@sun.com 	ASSERT(nbaddata + nbadparity == ntgts);
140410105Sadam.leventhal@sun.com 
140510105Sadam.leventhal@sun.com 	dt = &tgts[nbadparity];
140610105Sadam.leventhal@sun.com 
140710105Sadam.leventhal@sun.com 	/*
140810105Sadam.leventhal@sun.com 	 * See if we can use any of our optimized reconstruction routines.
140910105Sadam.leventhal@sun.com 	 */
141010105Sadam.leventhal@sun.com 	if (!vdev_raidz_default_to_general) {
141110105Sadam.leventhal@sun.com 		switch (nbaddata) {
141210105Sadam.leventhal@sun.com 		case 1:
141310105Sadam.leventhal@sun.com 			if (parity_valid[VDEV_RAIDZ_P])
141410105Sadam.leventhal@sun.com 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
141510105Sadam.leventhal@sun.com 
141610105Sadam.leventhal@sun.com 			ASSERT(rm->rm_firstdatacol > 1);
141710105Sadam.leventhal@sun.com 
141810105Sadam.leventhal@sun.com 			if (parity_valid[VDEV_RAIDZ_Q])
141910105Sadam.leventhal@sun.com 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
142010105Sadam.leventhal@sun.com 
142110105Sadam.leventhal@sun.com 			ASSERT(rm->rm_firstdatacol > 2);
142210105Sadam.leventhal@sun.com 			break;
142310105Sadam.leventhal@sun.com 
142410105Sadam.leventhal@sun.com 		case 2:
142510105Sadam.leventhal@sun.com 			ASSERT(rm->rm_firstdatacol > 1);
142610105Sadam.leventhal@sun.com 
142710105Sadam.leventhal@sun.com 			if (parity_valid[VDEV_RAIDZ_P] &&
142810105Sadam.leventhal@sun.com 			    parity_valid[VDEV_RAIDZ_Q])
142910105Sadam.leventhal@sun.com 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
143010105Sadam.leventhal@sun.com 
143110105Sadam.leventhal@sun.com 			ASSERT(rm->rm_firstdatacol > 2);
143210105Sadam.leventhal@sun.com 
143310105Sadam.leventhal@sun.com 			break;
143410105Sadam.leventhal@sun.com 		}
143510105Sadam.leventhal@sun.com 	}
143610105Sadam.leventhal@sun.com 
143710105Sadam.leventhal@sun.com 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
143810105Sadam.leventhal@sun.com 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
143910105Sadam.leventhal@sun.com 	ASSERT(code > 0);
144010105Sadam.leventhal@sun.com 	return (code);
144110105Sadam.leventhal@sun.com }
14422082Seschrock 
1443789Sahrens static int
vdev_raidz_open(vdev_t * vd,uint64_t * asize,uint64_t * ashift)1444789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
1445789Sahrens {
144610105Sadam.leventhal@sun.com 	vdev_t *cvd;
14472082Seschrock 	uint64_t nparity = vd->vdev_nparity;
144810105Sadam.leventhal@sun.com 	int c;
1449789Sahrens 	int lasterror = 0;
1450789Sahrens 	int numerrors = 0;
1451789Sahrens 
14522082Seschrock 	ASSERT(nparity > 0);
14532082Seschrock 
14542082Seschrock 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
14552082Seschrock 	    vd->vdev_children < nparity + 1) {
1456789Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1457789Sahrens 		return (EINVAL);
1458789Sahrens 	}
1459789Sahrens 
14609846SEric.Taylor@Sun.COM 	vdev_open_children(vd);
1461789Sahrens 
146210105Sadam.leventhal@sun.com 	for (c = 0; c < vd->vdev_children; c++) {
146310105Sadam.leventhal@sun.com 		cvd = vd->vdev_child[c];
14649846SEric.Taylor@Sun.COM 
146510105Sadam.leventhal@sun.com 		if (cvd->vdev_open_error != 0) {
14669846SEric.Taylor@Sun.COM 			lasterror = cvd->vdev_open_error;
1467789Sahrens 			numerrors++;
1468789Sahrens 			continue;
1469789Sahrens 		}
1470789Sahrens 
1471789Sahrens 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
14721732Sbonwick 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1473789Sahrens 	}
1474789Sahrens 
1475789Sahrens 	*asize *= vd->vdev_children;
1476789Sahrens 
14772082Seschrock 	if (numerrors > nparity) {
1478789Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1479789Sahrens 		return (lasterror);
1480789Sahrens 	}
1481789Sahrens 
1482789Sahrens 	return (0);
1483789Sahrens }
1484789Sahrens 
1485789Sahrens static void
vdev_raidz_close(vdev_t * vd)1486789Sahrens vdev_raidz_close(vdev_t *vd)
1487789Sahrens {
148810105Sadam.leventhal@sun.com 	int c;
148910105Sadam.leventhal@sun.com 
149010105Sadam.leventhal@sun.com 	for (c = 0; c < vd->vdev_children; c++)
1491789Sahrens 		vdev_close(vd->vdev_child[c]);
1492789Sahrens }
1493789Sahrens 
1494789Sahrens static uint64_t
vdev_raidz_asize(vdev_t * vd,uint64_t psize)1495789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1496789Sahrens {
1497789Sahrens 	uint64_t asize;
14981732Sbonwick 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1499789Sahrens 	uint64_t cols = vd->vdev_children;
15002082Seschrock 	uint64_t nparity = vd->vdev_nparity;
1501789Sahrens 
15021732Sbonwick 	asize = ((psize - 1) >> ashift) + 1;
15032082Seschrock 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
15042082Seschrock 	asize = roundup(asize, nparity + 1) << ashift;
1505789Sahrens 
1506789Sahrens 	return (asize);
1507789Sahrens }
1508789Sahrens 
1509789Sahrens static void
vdev_raidz_child_done(zio_t * zio)1510789Sahrens vdev_raidz_child_done(zio_t *zio)
1511789Sahrens {
1512789Sahrens 	raidz_col_t *rc = zio->io_private;
1513789Sahrens 
1514789Sahrens 	rc->rc_error = zio->io_error;
1515789Sahrens 	rc->rc_tried = 1;
1516789Sahrens 	rc->rc_skipped = 0;
1517789Sahrens }
1518789Sahrens 
15195530Sbonwick static int
vdev_raidz_io_start(zio_t * zio)1520789Sahrens vdev_raidz_io_start(zio_t *zio)
1521789Sahrens {
1522789Sahrens 	vdev_t *vd = zio->io_vd;
15231732Sbonwick 	vdev_t *tvd = vd->vdev_top;
1524789Sahrens 	vdev_t *cvd;
1525789Sahrens 	raidz_map_t *rm;
1526789Sahrens 	raidz_col_t *rc;
152710105Sadam.leventhal@sun.com 	int c, i;
1528789Sahrens 
15292082Seschrock 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
15302082Seschrock 	    vd->vdev_nparity);
1531789Sahrens 
15321775Sbillm 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1533789Sahrens 
1534789Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
153510105Sadam.leventhal@sun.com 		vdev_raidz_generate_parity(rm);
1536789Sahrens 
1537789Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
1538789Sahrens 			rc = &rm->rm_col[c];
15392082Seschrock 			cvd = vd->vdev_child[rc->rc_devidx];
1540789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1541789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
15427754SJeff.Bonwick@Sun.COM 			    zio->io_type, zio->io_priority, 0,
1543789Sahrens 			    vdev_raidz_child_done, rc));
1544789Sahrens 		}
15455530Sbonwick 
154610105Sadam.leventhal@sun.com 		/*
154710105Sadam.leventhal@sun.com 		 * Generate optional I/Os for any skipped sectors to improve
154810105Sadam.leventhal@sun.com 		 * aggregation contiguity.
154910105Sadam.leventhal@sun.com 		 */
155010450Sadam.leventhal@sun.com 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
155110105Sadam.leventhal@sun.com 			ASSERT(c <= rm->rm_scols);
155210105Sadam.leventhal@sun.com 			if (c == rm->rm_scols)
155310105Sadam.leventhal@sun.com 				c = 0;
155410105Sadam.leventhal@sun.com 			rc = &rm->rm_col[c];
155510105Sadam.leventhal@sun.com 			cvd = vd->vdev_child[rc->rc_devidx];
155610105Sadam.leventhal@sun.com 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
155710105Sadam.leventhal@sun.com 			    rc->rc_offset + rc->rc_size, NULL,
155810105Sadam.leventhal@sun.com 			    1 << tvd->vdev_ashift,
155910105Sadam.leventhal@sun.com 			    zio->io_type, zio->io_priority,
156010105Sadam.leventhal@sun.com 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
156110105Sadam.leventhal@sun.com 		}
156210105Sadam.leventhal@sun.com 
15637754SJeff.Bonwick@Sun.COM 		return (ZIO_PIPELINE_CONTINUE);
1564789Sahrens 	}
1565789Sahrens 
1566789Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1567789Sahrens 
15682082Seschrock 	/*
15692082Seschrock 	 * Iterate over the columns in reverse order so that we hit the parity
157010105Sadam.leventhal@sun.com 	 * last -- any errors along the way will force us to read the parity.
15712082Seschrock 	 */
1572789Sahrens 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1573789Sahrens 		rc = &rm->rm_col[c];
15742082Seschrock 		cvd = vd->vdev_child[rc->rc_devidx];
15755329Sgw25295 		if (!vdev_readable(cvd)) {
15762082Seschrock 			if (c >= rm->rm_firstdatacol)
15772082Seschrock 				rm->rm_missingdata++;
15782082Seschrock 			else
15792082Seschrock 				rm->rm_missingparity++;
1580789Sahrens 			rc->rc_error = ENXIO;
1581789Sahrens 			rc->rc_tried = 1;	/* don't even try */
1582789Sahrens 			rc->rc_skipped = 1;
1583789Sahrens 			continue;
1584789Sahrens 		}
158510922SJeff.Bonwick@Sun.COM 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
15862082Seschrock 			if (c >= rm->rm_firstdatacol)
15872082Seschrock 				rm->rm_missingdata++;
15882082Seschrock 			else
15892082Seschrock 				rm->rm_missingparity++;
1590789Sahrens 			rc->rc_error = ESTALE;
1591789Sahrens 			rc->rc_skipped = 1;
1592789Sahrens 			continue;
1593789Sahrens 		}
15942082Seschrock 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
15959434SMark.Musante@Sun.COM 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1596789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1597789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
15987754SJeff.Bonwick@Sun.COM 			    zio->io_type, zio->io_priority, 0,
1599789Sahrens 			    vdev_raidz_child_done, rc));
1600789Sahrens 		}
1601789Sahrens 	}
1602789Sahrens 
16037754SJeff.Bonwick@Sun.COM 	return (ZIO_PIPELINE_CONTINUE);
1604789Sahrens }
1605789Sahrens 
1606*12296SLin.Ling@Sun.COM 
16071544Seschrock /*
16081544Seschrock  * Report a checksum error for a child of a RAID-Z device.
16091544Seschrock  */
16101544Seschrock static void
raidz_checksum_error(zio_t * zio,raidz_col_t * rc,void * bad_data)161110614SJonathan.Adams@Sun.COM raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
16121544Seschrock {
16132082Seschrock 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
16141544Seschrock 
16151544Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
161610614SJonathan.Adams@Sun.COM 		zio_bad_cksum_t zbc;
161710614SJonathan.Adams@Sun.COM 		raidz_map_t *rm = zio->io_vsd;
161810614SJonathan.Adams@Sun.COM 
16191544Seschrock 		mutex_enter(&vd->vdev_stat_lock);
16201544Seschrock 		vd->vdev_stat.vs_checksum_errors++;
16211544Seschrock 		mutex_exit(&vd->vdev_stat_lock);
162210614SJonathan.Adams@Sun.COM 
162310614SJonathan.Adams@Sun.COM 		zbc.zbc_has_cksum = 0;
162410614SJonathan.Adams@Sun.COM 		zbc.zbc_injected = rm->rm_ecksuminjected;
162510614SJonathan.Adams@Sun.COM 
162610614SJonathan.Adams@Sun.COM 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
162710614SJonathan.Adams@Sun.COM 		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
162810614SJonathan.Adams@Sun.COM 		    &zbc);
16291544Seschrock 	}
163010614SJonathan.Adams@Sun.COM }
16311544Seschrock 
163210614SJonathan.Adams@Sun.COM /*
163310614SJonathan.Adams@Sun.COM  * We keep track of whether or not there were any injected errors, so that
163410614SJonathan.Adams@Sun.COM  * any ereports we generate can note it.
163510614SJonathan.Adams@Sun.COM  */
163610614SJonathan.Adams@Sun.COM static int
raidz_checksum_verify(zio_t * zio)163710614SJonathan.Adams@Sun.COM raidz_checksum_verify(zio_t *zio)
163810614SJonathan.Adams@Sun.COM {
163910614SJonathan.Adams@Sun.COM 	zio_bad_cksum_t zbc;
164010614SJonathan.Adams@Sun.COM 	raidz_map_t *rm = zio->io_vsd;
164110614SJonathan.Adams@Sun.COM 
164210614SJonathan.Adams@Sun.COM 	int ret = zio_checksum_error(zio, &zbc);
164310614SJonathan.Adams@Sun.COM 	if (ret != 0 && zbc.zbc_injected != 0)
164410614SJonathan.Adams@Sun.COM 		rm->rm_ecksuminjected = 1;
164510614SJonathan.Adams@Sun.COM 
164610614SJonathan.Adams@Sun.COM 	return (ret);
16471544Seschrock }
16481544Seschrock 
16492082Seschrock /*
16502082Seschrock  * Generate the parity from the data columns. If we tried and were able to
16512082Seschrock  * read the parity without error, verify that the generated parity matches the
16522082Seschrock  * data we read. If it doesn't, we fire off a checksum error. Return the
16532082Seschrock  * number such failures.
16542082Seschrock  */
16552082Seschrock static int
raidz_parity_verify(zio_t * zio,raidz_map_t * rm)16562082Seschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
16572082Seschrock {
16582082Seschrock 	void *orig[VDEV_RAIDZ_MAXPARITY];
16592082Seschrock 	int c, ret = 0;
16602082Seschrock 	raidz_col_t *rc;
16612082Seschrock 
16622082Seschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
16632082Seschrock 		rc = &rm->rm_col[c];
16642082Seschrock 		if (!rc->rc_tried || rc->rc_error != 0)
16652082Seschrock 			continue;
16662082Seschrock 		orig[c] = zio_buf_alloc(rc->rc_size);
16672082Seschrock 		bcopy(rc->rc_data, orig[c], rc->rc_size);
16682082Seschrock 	}
16692082Seschrock 
167010105Sadam.leventhal@sun.com 	vdev_raidz_generate_parity(rm);
16712082Seschrock 
16722082Seschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
16732082Seschrock 		rc = &rm->rm_col[c];
16742082Seschrock 		if (!rc->rc_tried || rc->rc_error != 0)
16752082Seschrock 			continue;
16762082Seschrock 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
167710614SJonathan.Adams@Sun.COM 			raidz_checksum_error(zio, rc, orig[c]);
16782082Seschrock 			rc->rc_error = ECKSUM;
16792082Seschrock 			ret++;
16802082Seschrock 		}
16812082Seschrock 		zio_buf_free(orig[c], rc->rc_size);
16822082Seschrock 	}
16832082Seschrock 
16842082Seschrock 	return (ret);
16852082Seschrock }
16862082Seschrock 
168710105Sadam.leventhal@sun.com /*
168810105Sadam.leventhal@sun.com  * Keep statistics on all the ways that we used parity to correct data.
168910105Sadam.leventhal@sun.com  */
169010105Sadam.leventhal@sun.com static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
16911544Seschrock 
16925530Sbonwick static int
vdev_raidz_worst_error(raidz_map_t * rm)16937754SJeff.Bonwick@Sun.COM vdev_raidz_worst_error(raidz_map_t *rm)
16947754SJeff.Bonwick@Sun.COM {
16957754SJeff.Bonwick@Sun.COM 	int error = 0;
16967754SJeff.Bonwick@Sun.COM 
16977754SJeff.Bonwick@Sun.COM 	for (int c = 0; c < rm->rm_cols; c++)
16987754SJeff.Bonwick@Sun.COM 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
16997754SJeff.Bonwick@Sun.COM 
17007754SJeff.Bonwick@Sun.COM 	return (error);
17017754SJeff.Bonwick@Sun.COM }
17027754SJeff.Bonwick@Sun.COM 
170310105Sadam.leventhal@sun.com /*
170410105Sadam.leventhal@sun.com  * Iterate over all combinations of bad data and attempt a reconstruction.
170510105Sadam.leventhal@sun.com  * Note that the algorithm below is non-optimal because it doesn't take into
170610105Sadam.leventhal@sun.com  * account how reconstruction is actually performed. For example, with
170710105Sadam.leventhal@sun.com  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
170810105Sadam.leventhal@sun.com  * is targeted as invalid as if columns 1 and 4 are targeted since in both
170910105Sadam.leventhal@sun.com  * cases we'd only use parity information in column 0.
171010105Sadam.leventhal@sun.com  */
171110105Sadam.leventhal@sun.com static int
vdev_raidz_combrec(zio_t * zio,int total_errors,int data_errors)171210105Sadam.leventhal@sun.com vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
171310105Sadam.leventhal@sun.com {
171410105Sadam.leventhal@sun.com 	raidz_map_t *rm = zio->io_vsd;
171510105Sadam.leventhal@sun.com 	raidz_col_t *rc;
171610105Sadam.leventhal@sun.com 	void *orig[VDEV_RAIDZ_MAXPARITY];
171710105Sadam.leventhal@sun.com 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
171810105Sadam.leventhal@sun.com 	int *tgts = &tstore[1];
171910105Sadam.leventhal@sun.com 	int current, next, i, c, n;
172010105Sadam.leventhal@sun.com 	int code, ret = 0;
172110105Sadam.leventhal@sun.com 
172210105Sadam.leventhal@sun.com 	ASSERT(total_errors < rm->rm_firstdatacol);
172310105Sadam.leventhal@sun.com 
172410105Sadam.leventhal@sun.com 	/*
172510105Sadam.leventhal@sun.com 	 * This simplifies one edge condition.
172610105Sadam.leventhal@sun.com 	 */
172710105Sadam.leventhal@sun.com 	tgts[-1] = -1;
172810105Sadam.leventhal@sun.com 
172910105Sadam.leventhal@sun.com 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
173010105Sadam.leventhal@sun.com 		/*
173110105Sadam.leventhal@sun.com 		 * Initialize the targets array by finding the first n columns
173210105Sadam.leventhal@sun.com 		 * that contain no error.
173310105Sadam.leventhal@sun.com 		 *
173410105Sadam.leventhal@sun.com 		 * If there were no data errors, we need to ensure that we're
173510105Sadam.leventhal@sun.com 		 * always explicitly attempting to reconstruct at least one
173610105Sadam.leventhal@sun.com 		 * data column. To do this, we simply push the highest target
173710105Sadam.leventhal@sun.com 		 * up into the data columns.
173810105Sadam.leventhal@sun.com 		 */
173910105Sadam.leventhal@sun.com 		for (c = 0, i = 0; i < n; i++) {
174010105Sadam.leventhal@sun.com 			if (i == n - 1 && data_errors == 0 &&
174110105Sadam.leventhal@sun.com 			    c < rm->rm_firstdatacol) {
174210105Sadam.leventhal@sun.com 				c = rm->rm_firstdatacol;
174310105Sadam.leventhal@sun.com 			}
174410105Sadam.leventhal@sun.com 
174510105Sadam.leventhal@sun.com 			while (rm->rm_col[c].rc_error != 0) {
174610105Sadam.leventhal@sun.com 				c++;
174710105Sadam.leventhal@sun.com 				ASSERT3S(c, <, rm->rm_cols);
174810105Sadam.leventhal@sun.com 			}
174910105Sadam.leventhal@sun.com 
175010105Sadam.leventhal@sun.com 			tgts[i] = c++;
175110105Sadam.leventhal@sun.com 		}
175210105Sadam.leventhal@sun.com 
175310105Sadam.leventhal@sun.com 		/*
175410105Sadam.leventhal@sun.com 		 * Setting tgts[n] simplifies the other edge condition.
175510105Sadam.leventhal@sun.com 		 */
175610105Sadam.leventhal@sun.com 		tgts[n] = rm->rm_cols;
175710105Sadam.leventhal@sun.com 
175810105Sadam.leventhal@sun.com 		/*
175910105Sadam.leventhal@sun.com 		 * These buffers were allocated in previous iterations.
176010105Sadam.leventhal@sun.com 		 */
176110105Sadam.leventhal@sun.com 		for (i = 0; i < n - 1; i++) {
176210105Sadam.leventhal@sun.com 			ASSERT(orig[i] != NULL);
176310105Sadam.leventhal@sun.com 		}
176410105Sadam.leventhal@sun.com 
176510105Sadam.leventhal@sun.com 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
176610105Sadam.leventhal@sun.com 
176710105Sadam.leventhal@sun.com 		current = 0;
176810105Sadam.leventhal@sun.com 		next = tgts[current];
176910105Sadam.leventhal@sun.com 
177010105Sadam.leventhal@sun.com 		while (current != n) {
177110105Sadam.leventhal@sun.com 			tgts[current] = next;
177210105Sadam.leventhal@sun.com 			current = 0;
177310105Sadam.leventhal@sun.com 
177410105Sadam.leventhal@sun.com 			/*
177510105Sadam.leventhal@sun.com 			 * Save off the original data that we're going to
177610105Sadam.leventhal@sun.com 			 * attempt to reconstruct.
177710105Sadam.leventhal@sun.com 			 */
177810105Sadam.leventhal@sun.com 			for (i = 0; i < n; i++) {
177910105Sadam.leventhal@sun.com 				ASSERT(orig[i] != NULL);
178010105Sadam.leventhal@sun.com 				c = tgts[i];
178110105Sadam.leventhal@sun.com 				ASSERT3S(c, >=, 0);
178210105Sadam.leventhal@sun.com 				ASSERT3S(c, <, rm->rm_cols);
178310105Sadam.leventhal@sun.com 				rc = &rm->rm_col[c];
178410105Sadam.leventhal@sun.com 				bcopy(rc->rc_data, orig[i], rc->rc_size);
178510105Sadam.leventhal@sun.com 			}
178610105Sadam.leventhal@sun.com 
178710105Sadam.leventhal@sun.com 			/*
178810105Sadam.leventhal@sun.com 			 * Attempt a reconstruction and exit the outer loop on
178910105Sadam.leventhal@sun.com 			 * success.
179010105Sadam.leventhal@sun.com 			 */
179110105Sadam.leventhal@sun.com 			code = vdev_raidz_reconstruct(rm, tgts, n);
179210614SJonathan.Adams@Sun.COM 			if (raidz_checksum_verify(zio) == 0) {
179310105Sadam.leventhal@sun.com 				atomic_inc_64(&raidz_corrected[code]);
179410105Sadam.leventhal@sun.com 
179510105Sadam.leventhal@sun.com 				for (i = 0; i < n; i++) {
179610105Sadam.leventhal@sun.com 					c = tgts[i];
179710105Sadam.leventhal@sun.com 					rc = &rm->rm_col[c];
179810105Sadam.leventhal@sun.com 					ASSERT(rc->rc_error == 0);
179910614SJonathan.Adams@Sun.COM 					if (rc->rc_tried)
180010614SJonathan.Adams@Sun.COM 						raidz_checksum_error(zio, rc,
180110614SJonathan.Adams@Sun.COM 						    orig[i]);
180210105Sadam.leventhal@sun.com 					rc->rc_error = ECKSUM;
180310105Sadam.leventhal@sun.com 				}
180410105Sadam.leventhal@sun.com 
180510105Sadam.leventhal@sun.com 				ret = code;
180610105Sadam.leventhal@sun.com 				goto done;
180710105Sadam.leventhal@sun.com 			}
180810105Sadam.leventhal@sun.com 
180910105Sadam.leventhal@sun.com 			/*
181010105Sadam.leventhal@sun.com 			 * Restore the original data.
181110105Sadam.leventhal@sun.com 			 */
181210105Sadam.leventhal@sun.com 			for (i = 0; i < n; i++) {
181310105Sadam.leventhal@sun.com 				c = tgts[i];
181410105Sadam.leventhal@sun.com 				rc = &rm->rm_col[c];
181510105Sadam.leventhal@sun.com 				bcopy(orig[i], rc->rc_data, rc->rc_size);
181610105Sadam.leventhal@sun.com 			}
181710105Sadam.leventhal@sun.com 
181810105Sadam.leventhal@sun.com 			do {
181910105Sadam.leventhal@sun.com 				/*
182010105Sadam.leventhal@sun.com 				 * Find the next valid column after the current
182110105Sadam.leventhal@sun.com 				 * position..
182210105Sadam.leventhal@sun.com 				 */
182310105Sadam.leventhal@sun.com 				for (next = tgts[current] + 1;
182410105Sadam.leventhal@sun.com 				    next < rm->rm_cols &&
182510105Sadam.leventhal@sun.com 				    rm->rm_col[next].rc_error != 0; next++)
182610105Sadam.leventhal@sun.com 					continue;
182710105Sadam.leventhal@sun.com 
182810105Sadam.leventhal@sun.com 				ASSERT(next <= tgts[current + 1]);
182910105Sadam.leventhal@sun.com 
183010105Sadam.leventhal@sun.com 				/*
183110105Sadam.leventhal@sun.com 				 * If that spot is available, we're done here.
183210105Sadam.leventhal@sun.com 				 */
183310105Sadam.leventhal@sun.com 				if (next != tgts[current + 1])
183410105Sadam.leventhal@sun.com 					break;
183510105Sadam.leventhal@sun.com 
183610105Sadam.leventhal@sun.com 				/*
183710105Sadam.leventhal@sun.com 				 * Otherwise, find the next valid column after
183810105Sadam.leventhal@sun.com 				 * the previous position.
183910105Sadam.leventhal@sun.com 				 */
184010105Sadam.leventhal@sun.com 				for (c = tgts[current - 1] + 1;
184110105Sadam.leventhal@sun.com 				    rm->rm_col[c].rc_error != 0; c++)
184210105Sadam.leventhal@sun.com 					continue;
184310105Sadam.leventhal@sun.com 
184410105Sadam.leventhal@sun.com 				tgts[current] = c;
184510105Sadam.leventhal@sun.com 				current++;
184610105Sadam.leventhal@sun.com 
184710105Sadam.leventhal@sun.com 			} while (current != n);
184810105Sadam.leventhal@sun.com 		}
184910105Sadam.leventhal@sun.com 	}
185010105Sadam.leventhal@sun.com 	n--;
185110105Sadam.leventhal@sun.com done:
185210105Sadam.leventhal@sun.com 	for (i = 0; i < n; i++) {
185310105Sadam.leventhal@sun.com 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
185410105Sadam.leventhal@sun.com 	}
185510105Sadam.leventhal@sun.com 
185610105Sadam.leventhal@sun.com 	return (ret);
185710105Sadam.leventhal@sun.com }
185810105Sadam.leventhal@sun.com 
18597754SJeff.Bonwick@Sun.COM static void
vdev_raidz_io_done(zio_t * zio)1860789Sahrens vdev_raidz_io_done(zio_t *zio)
1861789Sahrens {
1862789Sahrens 	vdev_t *vd = zio->io_vd;
1863789Sahrens 	vdev_t *cvd;
1864789Sahrens 	raidz_map_t *rm = zio->io_vsd;
186510105Sadam.leventhal@sun.com 	raidz_col_t *rc;
1866789Sahrens 	int unexpected_errors = 0;
18672082Seschrock 	int parity_errors = 0;
18683456Sahl 	int parity_untried = 0;
18692082Seschrock 	int data_errors = 0;
18707754SJeff.Bonwick@Sun.COM 	int total_errors = 0;
187110105Sadam.leventhal@sun.com 	int n, c;
187210105Sadam.leventhal@sun.com 	int tgts[VDEV_RAIDZ_MAXPARITY];
187310105Sadam.leventhal@sun.com 	int code;
1874789Sahrens 
18751775Sbillm 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
1876789Sahrens 
18772082Seschrock 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
18782082Seschrock 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
18792082Seschrock 
1880789Sahrens 	for (c = 0; c < rm->rm_cols; c++) {
1881789Sahrens 		rc = &rm->rm_col[c];
1882789Sahrens 
1883789Sahrens 		if (rc->rc_error) {
18847754SJeff.Bonwick@Sun.COM 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
18852082Seschrock 
18862082Seschrock 			if (c < rm->rm_firstdatacol)
18872082Seschrock 				parity_errors++;
18882082Seschrock 			else
18892082Seschrock 				data_errors++;
18902082Seschrock 
1891789Sahrens 			if (!rc->rc_skipped)
1892789Sahrens 				unexpected_errors++;
18932082Seschrock 
18947754SJeff.Bonwick@Sun.COM 			total_errors++;
18953456Sahl 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
18963456Sahl 			parity_untried++;
1897789Sahrens 		}
1898789Sahrens 	}
1899789Sahrens 
1900789Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
1901789Sahrens 		/*
19027754SJeff.Bonwick@Sun.COM 		 * XXX -- for now, treat partial writes as a success.
19037754SJeff.Bonwick@Sun.COM 		 * (If we couldn't write enough columns to reconstruct
19047754SJeff.Bonwick@Sun.COM 		 * the data, the I/O failed.  Otherwise, good enough.)
19057754SJeff.Bonwick@Sun.COM 		 *
19067754SJeff.Bonwick@Sun.COM 		 * Now that we support write reallocation, it would be better
19077754SJeff.Bonwick@Sun.COM 		 * to treat partial failure as real failure unless there are
19087754SJeff.Bonwick@Sun.COM 		 * no non-degraded top-level vdevs left, and not update DTLs
19097754SJeff.Bonwick@Sun.COM 		 * if we intend to reallocate.
1910789Sahrens 		 */
1911789Sahrens 		/* XXPOLICY */
19127754SJeff.Bonwick@Sun.COM 		if (total_errors > rm->rm_firstdatacol)
19137754SJeff.Bonwick@Sun.COM 			zio->io_error = vdev_raidz_worst_error(rm);
1914789Sahrens 
19157754SJeff.Bonwick@Sun.COM 		return;
1916789Sahrens 	}
1917789Sahrens 
1918789Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
19192082Seschrock 	/*
19202082Seschrock 	 * There are three potential phases for a read:
19212082Seschrock 	 *	1. produce valid data from the columns read
19222082Seschrock 	 *	2. read all disks and try again
19232082Seschrock 	 *	3. perform combinatorial reconstruction
19242082Seschrock 	 *
19252082Seschrock 	 * Each phase is progressively both more expensive and less likely to
19262082Seschrock 	 * occur. If we encounter more errors than we can repair or all phases
19272082Seschrock 	 * fail, we have no choice but to return an error.
19282082Seschrock 	 */
1929789Sahrens 
1930789Sahrens 	/*
19312082Seschrock 	 * If the number of errors we saw was correctable -- less than or equal
19323456Sahl 	 * to the number of parity disks read -- attempt to produce data that
19333456Sahl 	 * has a valid checksum. Naturally, this case applies in the absence of
19343456Sahl 	 * any errors.
1935789Sahrens 	 */
19367754SJeff.Bonwick@Sun.COM 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
193710105Sadam.leventhal@sun.com 		if (data_errors == 0) {
193810614SJonathan.Adams@Sun.COM 			if (raidz_checksum_verify(zio) == 0) {
19394034Sahl 				/*
19404034Sahl 				 * If we read parity information (unnecessarily
19414034Sahl 				 * as it happens since no reconstruction was
19424034Sahl 				 * needed) regenerate and verify the parity.
19434034Sahl 				 * We also regenerate parity when resilvering
19444034Sahl 				 * so we can write it out to the failed device
19454034Sahl 				 * later.
19464034Sahl 				 */
19473456Sahl 				if (parity_errors + parity_untried <
19484034Sahl 				    rm->rm_firstdatacol ||
19494034Sahl 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
19503456Sahl 					n = raidz_parity_verify(zio, rm);
19513456Sahl 					unexpected_errors += n;
19523456Sahl 					ASSERT(parity_errors + n <=
19533456Sahl 					    rm->rm_firstdatacol);
19543456Sahl 				}
19552082Seschrock 				goto done;
19562082Seschrock 			}
195710105Sadam.leventhal@sun.com 		} else {
19583456Sahl 			/*
19593456Sahl 			 * We either attempt to read all the parity columns or
19603456Sahl 			 * none of them. If we didn't try to read parity, we
19613456Sahl 			 * wouldn't be here in the correctable case. There must
19623456Sahl 			 * also have been fewer parity errors than parity
19633456Sahl 			 * columns or, again, we wouldn't be in this code path.
19643456Sahl 			 */
19653456Sahl 			ASSERT(parity_untried == 0);
19662082Seschrock 			ASSERT(parity_errors < rm->rm_firstdatacol);
19672082Seschrock 
19682082Seschrock 			/*
196910105Sadam.leventhal@sun.com 			 * Identify the data columns that reported an error.
19702082Seschrock 			 */
197110105Sadam.leventhal@sun.com 			n = 0;
19722082Seschrock 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
19732082Seschrock 				rc = &rm->rm_col[c];
197410105Sadam.leventhal@sun.com 				if (rc->rc_error != 0) {
197510105Sadam.leventhal@sun.com 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
197610105Sadam.leventhal@sun.com 					tgts[n++] = c;
197710105Sadam.leventhal@sun.com 				}
19782082Seschrock 			}
19792082Seschrock 
198010105Sadam.leventhal@sun.com 			ASSERT(rm->rm_firstdatacol >= n);
198110105Sadam.leventhal@sun.com 
198210105Sadam.leventhal@sun.com 			code = vdev_raidz_reconstruct(rm, tgts, n);
19832082Seschrock 
198410614SJonathan.Adams@Sun.COM 			if (raidz_checksum_verify(zio) == 0) {
198510105Sadam.leventhal@sun.com 				atomic_inc_64(&raidz_corrected[code]);
1986789Sahrens 
19872082Seschrock 				/*
198810105Sadam.leventhal@sun.com 				 * If we read more parity disks than were used
198910105Sadam.leventhal@sun.com 				 * for reconstruction, confirm that the other
199010105Sadam.leventhal@sun.com 				 * parity disks produced correct data. This
199110105Sadam.leventhal@sun.com 				 * routine is suboptimal in that it regenerates
199210105Sadam.leventhal@sun.com 				 * the parity that we already used in addition
199310105Sadam.leventhal@sun.com 				 * to the parity that we're attempting to
199410105Sadam.leventhal@sun.com 				 * verify, but this should be a relatively
199510105Sadam.leventhal@sun.com 				 * uncommon case, and can be optimized if it
199610105Sadam.leventhal@sun.com 				 * becomes a problem. Note that we regenerate
199710105Sadam.leventhal@sun.com 				 * parity when resilvering so we can write it
199810105Sadam.leventhal@sun.com 				 * out to failed devices later.
19992082Seschrock 				 */
200010105Sadam.leventhal@sun.com 				if (parity_errors < rm->rm_firstdatacol - n ||
20014034Sahl 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
20022082Seschrock 					n = raidz_parity_verify(zio, rm);
20032082Seschrock 					unexpected_errors += n;
20042082Seschrock 					ASSERT(parity_errors + n <=
20052082Seschrock 					    rm->rm_firstdatacol);
20062082Seschrock 				}
20072082Seschrock 
20082082Seschrock 				goto done;
20092082Seschrock 			}
2010789Sahrens 		}
2011789Sahrens 	}
2012789Sahrens 
2013789Sahrens 	/*
20142082Seschrock 	 * This isn't a typical situation -- either we got a read error or
20152082Seschrock 	 * a child silently returned bad data. Read every block so we can
20162082Seschrock 	 * try again with as much data and parity as we can track down. If
20172082Seschrock 	 * we've already been through once before, all children will be marked
20182082Seschrock 	 * as tried so we'll proceed to combinatorial reconstruction.
2019789Sahrens 	 */
2020789Sahrens 	unexpected_errors = 1;
20212082Seschrock 	rm->rm_missingdata = 0;
20222082Seschrock 	rm->rm_missingparity = 0;
2023789Sahrens 
20242082Seschrock 	for (c = 0; c < rm->rm_cols; c++) {
20252082Seschrock 		if (rm->rm_col[c].rc_tried)
20262082Seschrock 			continue;
2027789Sahrens 
2028789Sahrens 		zio_vdev_io_redone(zio);
20292082Seschrock 		do {
2030789Sahrens 			rc = &rm->rm_col[c];
2031789Sahrens 			if (rc->rc_tried)
2032789Sahrens 				continue;
2033789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL,
20342082Seschrock 			    vd->vdev_child[rc->rc_devidx],
2035789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
20367754SJeff.Bonwick@Sun.COM 			    zio->io_type, zio->io_priority, 0,
2037789Sahrens 			    vdev_raidz_child_done, rc));
20382082Seschrock 		} while (++c < rm->rm_cols);
20395530Sbonwick 
20407754SJeff.Bonwick@Sun.COM 		return;
2041789Sahrens 	}
2042789Sahrens 
2043789Sahrens 	/*
20442082Seschrock 	 * At this point we've attempted to reconstruct the data given the
20452082Seschrock 	 * errors we detected, and we've attempted to read all columns. There
20462082Seschrock 	 * must, therefore, be one or more additional problems -- silent errors
20472082Seschrock 	 * resulting in invalid data rather than explicit I/O errors resulting
204810105Sadam.leventhal@sun.com 	 * in absent data. We check if there is enough additional data to
204910105Sadam.leventhal@sun.com 	 * possibly reconstruct the data and then perform combinatorial
205010105Sadam.leventhal@sun.com 	 * reconstruction over all possible combinations. If that fails,
205110105Sadam.leventhal@sun.com 	 * we're cooked.
2052789Sahrens 	 */
205310614SJonathan.Adams@Sun.COM 	if (total_errors > rm->rm_firstdatacol) {
20547754SJeff.Bonwick@Sun.COM 		zio->io_error = vdev_raidz_worst_error(rm);
20552082Seschrock 
205610614SJonathan.Adams@Sun.COM 	} else if (total_errors < rm->rm_firstdatacol &&
205710614SJonathan.Adams@Sun.COM 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
20582082Seschrock 		/*
205910105Sadam.leventhal@sun.com 		 * If we didn't use all the available parity for the
206010105Sadam.leventhal@sun.com 		 * combinatorial reconstruction, verify that the remaining
206110105Sadam.leventhal@sun.com 		 * parity is correct.
20622082Seschrock 		 */
206310105Sadam.leventhal@sun.com 		if (code != (1 << rm->rm_firstdatacol) - 1)
206410105Sadam.leventhal@sun.com 			(void) raidz_parity_verify(zio, rm);
206510105Sadam.leventhal@sun.com 	} else {
206610105Sadam.leventhal@sun.com 		/*
206710614SJonathan.Adams@Sun.COM 		 * We're here because either:
206810614SJonathan.Adams@Sun.COM 		 *
206910614SJonathan.Adams@Sun.COM 		 *	total_errors == rm_first_datacol, or
207010614SJonathan.Adams@Sun.COM 		 *	vdev_raidz_combrec() failed
207110614SJonathan.Adams@Sun.COM 		 *
207210614SJonathan.Adams@Sun.COM 		 * In either case, there is enough bad data to prevent
207310614SJonathan.Adams@Sun.COM 		 * reconstruction.
207410614SJonathan.Adams@Sun.COM 		 *
207510614SJonathan.Adams@Sun.COM 		 * Start checksum ereports for all children which haven't
207611670SNeil.Perrin@Sun.COM 		 * failed, and the IO wasn't speculative.
207710105Sadam.leventhal@sun.com 		 */
207810105Sadam.leventhal@sun.com 		zio->io_error = ECKSUM;
20792082Seschrock 
208011670SNeil.Perrin@Sun.COM 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
208111670SNeil.Perrin@Sun.COM 			for (c = 0; c < rm->rm_cols; c++) {
208211670SNeil.Perrin@Sun.COM 				rc = &rm->rm_col[c];
208311670SNeil.Perrin@Sun.COM 				if (rc->rc_error == 0) {
208411670SNeil.Perrin@Sun.COM 					zio_bad_cksum_t zbc;
208511670SNeil.Perrin@Sun.COM 					zbc.zbc_has_cksum = 0;
208611670SNeil.Perrin@Sun.COM 					zbc.zbc_injected =
208711670SNeil.Perrin@Sun.COM 					    rm->rm_ecksuminjected;
208810614SJonathan.Adams@Sun.COM 
208911670SNeil.Perrin@Sun.COM 					zfs_ereport_start_checksum(
209011670SNeil.Perrin@Sun.COM 					    zio->io_spa,
209111670SNeil.Perrin@Sun.COM 					    vd->vdev_child[rc->rc_devidx],
209211670SNeil.Perrin@Sun.COM 					    zio, rc->rc_offset, rc->rc_size,
209311670SNeil.Perrin@Sun.COM 					    (void *)(uintptr_t)c, &zbc);
209411670SNeil.Perrin@Sun.COM 				}
20952082Seschrock 			}
20961544Seschrock 		}
20971544Seschrock 	}
2098789Sahrens 
2099789Sahrens done:
2100789Sahrens 	zio_checksum_verified(zio);
2101789Sahrens 
21028241SJeff.Bonwick@Sun.COM 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2103789Sahrens 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2104789Sahrens 		/*
2105789Sahrens 		 * Use the good data we have in hand to repair damaged children.
2106789Sahrens 		 */
2107789Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
2108789Sahrens 			rc = &rm->rm_col[c];
21092082Seschrock 			cvd = vd->vdev_child[rc->rc_devidx];
2110789Sahrens 
21111732Sbonwick 			if (rc->rc_error == 0)
21121732Sbonwick 				continue;
21131732Sbonwick 
21147754SJeff.Bonwick@Sun.COM 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
21151732Sbonwick 			    rc->rc_offset, rc->rc_data, rc->rc_size,
21161732Sbonwick 			    ZIO_TYPE_WRITE, zio->io_priority,
21178241SJeff.Bonwick@Sun.COM 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
21188241SJeff.Bonwick@Sun.COM 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
21191732Sbonwick 		}
2120789Sahrens 	}
2121789Sahrens }
2122789Sahrens 
2123789Sahrens static void
vdev_raidz_state_change(vdev_t * vd,int faulted,int degraded)2124789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2125789Sahrens {
21262082Seschrock 	if (faulted > vd->vdev_nparity)
21271544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
21281544Seschrock 		    VDEV_AUX_NO_REPLICAS);
2129789Sahrens 	else if (degraded + faulted != 0)
21301544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2131789Sahrens 	else
21321544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2133789Sahrens }
2134789Sahrens 
2135789Sahrens vdev_ops_t vdev_raidz_ops = {
2136789Sahrens 	vdev_raidz_open,
2137789Sahrens 	vdev_raidz_close,
2138789Sahrens 	vdev_raidz_asize,
2139789Sahrens 	vdev_raidz_io_start,
2140789Sahrens 	vdev_raidz_io_done,
2141789Sahrens 	vdev_raidz_state_change,
214211958SGeorge.Wilson@Sun.COM 	NULL,
214311958SGeorge.Wilson@Sun.COM 	NULL,
2144789Sahrens 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2145789Sahrens 	B_FALSE			/* not a leaf vdev */
2146789Sahrens };
2147