/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, GF(2^N). In our case
 * we choose N=8, i.e. GF(2^8) = GF(256), so that all elements can be
 * expressed with a single byte.
 * Briefly, the operations on the field are defined as follows:
 *
 *	o addition (+) is represented by a bitwise XOR
 *	o subtraction (-) is therefore identical to addition: A + B = A - B
 *	o multiplication of A by 2 is defined by the following bitwise expression:
 *		(A * 2)_7 = A_6
 *		(A * 2)_6 = A_5
 *		(A * 2)_5 = A_4
 *		(A * 2)_4 = A_3 + A_7
 *		(A * 2)_3 = A_2 + A_7
 *		(A * 2)_2 = A_1 + A_7
 *		(A * 2)_1 = A_0
 *		(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R, over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property, which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
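
/*
 * As a small worked example of the field operations above, take three data
 * bytes D_0 = 0x01, D_1 = 0x02, D_2 = 0x03:
 *
 *	P = 0x01 + 0x02 + 0x03                 = 0x00
 *	Q = ((0x01 * 2) + 0x02) * 2 + 0x03     = 0x03
 *	R = ((0x01 * 4) + 0x02) * 4 + 0x03     = 0x1b
 *
 * since 0x01 * 2 = 0x02, 0x02 + 0x02 = 0x00, and 0x06 * 4 = 0x18.
 */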

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_skipped;		/* Skipped sectors for padding */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2
#define	VDEV_RAIDZ_MAXPARITY	3

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
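
/*
 * For example, with x = 0x00000000000000ff only the low byte has its top
 * bit set, so the mask becomes 0x00000000000000ff (0x100 - 0x01 within that
 * byte) and the final step computes (0xfe ^ 0x1d) = 0xe3 -- the same result
 * as multiplying the byte 0xff by 2 one byte at a time.
 */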

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

static void
vdev_raidz_map_free(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;
	int c;

	for (c = 0; c < rm->rm_firstdatacol; c++)
		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}
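
/*
 * Map a logical I/O onto the child columns: the first rm_firstdatacol
 * columns hold parity and the remainder hold data, rotated so that
 * consecutive stripes start on different children. "q" is the number of
 * full sectors each data column receives, "r" is the number of data columns
 * that get one extra sector (parity columns are always as large as the
 * largest column), and enough skip sectors are accounted for (rm_skipped)
 * to round the allocation up to a multiple of nparity + 1 sectors. For
 * example, a 2-sector write to a 5-wide single-parity raidz yields q = 0,
 * r = 2, three 1-sector columns (P plus two data columns), and one skip
 * sector, for an allocated size of 4 sectors.
 */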

static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = zio->io_offset >> unit_shift;
	uint64_t s = zio->io_size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_skipped = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_skipped << unit_shift);
	ASSERT3U(rm->rm_skipped, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = zio->io_data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter, unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;
	}

	zio->io_vsd = rm;
	zio->io_vsd_free = vdev_raidz_map_free;
	return (rm);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}
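
/*
 * The Q and R parity below is accumulated Horner-style: the running parity
 * word is multiplied by the generator (2 for Q, 4 for R) before the next
 * data column is XORed in, which produces the nested form of the Q and R
 * equations given in the block comment at the top of this file.
 */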

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, i;
	int x = tgts[0];
	int c;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
	ASSERT(xcount > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	dst = rm->rm_col[x].rc_data;
	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst = *src;
	}

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			continue;

		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
		count = MIN(ccount, xcount);

		for (i = 0; i < count; i++, dst++, src++) {
			*dst ^= *src;
		}
	}

	return (1 << VDEV_RAIDZ_P);
}
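
/*
 * To rebuild a single data column from Q, the loop below recomputes Q as
 * though the missing column were zero; XORing that against the stored Q
 * isolates 2^(ndevs - 1 - x) * D_x, which is then divided by the column's
 * coefficient by multiplying by 2^(255 - (ndevs - 1 - x)).
 */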

static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}
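
/*
 * Two-way reconstruction from P and Q works the same way: recompute both
 * parity columns as though the two missing columns were zero (Pxy and Qxy
 * below). Because parity is linear over the field, P + Pxy and Q + Qxy
 * depend only on the missing columns, giving two equations in the two
 * unknowns D_x and D_y.
 */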

static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = (2^(ndevs - 1 - x) * (2^(x - y) + 1))^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                      __     __
 *           |     |        __     __     |  p_0  |
 *           |  V  |       |  D_0  |      |   :   |
 *           |     |       |   :   |      | p_m-1 |
 *           |  I  |   x   | D_n-1 |  =   |  d_0  |
 *           |     |        ~~     ~~     |   :   |
 *            ~~   ~~                     | d_n-1 |
 *                                         ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, for speed of
 * computation, and for their linear separability.
 *
 *            __                   __      __     __       __     __
 *           |  1     ..  1   1   1  |    |  D_0  |        |  p_0  |
 *           |  2^n-1 ..  4   2   1  |    |  D_1  |        |   :   |
 *           |  4^n-1 .. 16   4   1  |    |  D_2  |        | p_m-1 |
 *           |  1     ..  0   0   0  |    |   :   |        |  d_0  |
 *           |  0     ..  0   0   0  |  x | D_n-1 |   =    |  d_1  |
 *           |  :         :   :   :  |     ~~     ~~       |  d_2  |
 *           |  0     ..  1   0   0  |                     |   :   |
 *           |  0     ..  0   1   0  |                     |   :   |
 *           |  0     ..  0   0   1  |                     | d_n-1 |
 *            ~~                   ~~                       ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
In the example below we use m=3 parity 758*10105Sadam.leventhal@sun.com * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 759*10105Sadam.leventhal@sun.com * __ __ 760*10105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 | 761*10105Sadam.leventhal@sun.com * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 762*10105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 | / / 763*10105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 | / / 764*10105Sadam.leventhal@sun.com * | 0 1 0 0 0 0 0 0 | <--' / 765*10105Sadam.leventhal@sun.com * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 766*10105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 | 767*10105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 | 768*10105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 | 769*10105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 | 770*10105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 | 771*10105Sadam.leventhal@sun.com * ~~ ~~ 772*10105Sadam.leventhal@sun.com * __ __ 773*10105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 | 774*10105Sadam.leventhal@sun.com * | 128 64 32 16 8 4 2 1 | 775*10105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 | 776*10105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 | 777*10105Sadam.leventhal@sun.com * | 0 1 0 0 0 0 0 0 | 778*10105Sadam.leventhal@sun.com * (V|I)' = | 0 0 1 0 0 0 0 0 | 779*10105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 | 780*10105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 | 781*10105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 | 782*10105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 | 783*10105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 | 784*10105Sadam.leventhal@sun.com * ~~ ~~ 785*10105Sadam.leventhal@sun.com * 786*10105Sadam.leventhal@sun.com * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 787*10105Sadam.leventhal@sun.com * have carefully chosen the seed values 1, 2, and 4 to ensure that this 788*10105Sadam.leventhal@sun.com * matrix is not singular. 
 *              __                                                                 __
 *             |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *             |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 *             |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *             |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *              ~~                                                                 ~~
 *              __                                                                 __
 *             |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *             |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *             |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 *             |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *              ~~                                                                 ~~
 *              __                                                                 __
 *             |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *             |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *             |  0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1  |
 *             |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *              ~~                                                                 ~~
 *              __                                                                 __
 *             |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *             |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *             |  0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204  |
 *             |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *              ~~                                                                 ~~
 *              __                                                                 __
 *             |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *             |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *             |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 *             |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *              ~~                                                                 ~~
 *              __                                                                 __
 *             |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *             |  0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208  |
 *             |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 *             |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *              ~~                                                                 ~~
 *                 __                               __
 *                |  0   0   1   0   0   0   0   0  |
 *                | 167 100   5  41 159 169 217 208 |
 *                | 166 100   4  40 158 168 216 209 |
 *   (V|I)'^-1 =  |  0   0   0   1   0   0   0   0  |
 *                |  0   0   0   0   1   0   0   0  |
 *                |  0   0   0   0   0   1   0   0  |
 *                |  0   0   0   0   0   0   1   0  |
 *                |  0   0   0   0   0   0   0   1  |
 *                 ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */
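
/*
 * vdev_raidz_matrix_init() below builds just those rows of the dispersal
 * matrix: row i corresponds to parity column map[i] (0 for P, 1 for Q, 2 for
 * R) and its entries are (2^map[i])^(n-1-j), i.e. the generator 1, 2, or 4
 * raised to decreasing powers, matching the V matrix pictured above.
 */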

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
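
/*
 * Invert the rows of interest in place using Gauss-Jordan elimination,
 * accumulating the corresponding rows of the inverse matrix in invrows.
 */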
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
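
/*
 * Multiply the surviving data and parity columns by the rows of the inverse
 * computed above: each byte of each used column is scaled (via the log and
 * power tables) by the matching inverse coefficient and XORed into every
 * missing data column being rebuilt.
 */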
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
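
/*
 * General reconstruction handles any combination of missing data and parity
 * columns: it selects enough surviving parity columns to match the number of
 * missing data columns, builds and inverts the corresponding rows, and
 * returns a bitmask of the parity columns that were used (the same encoding
 * returned by the specialized routines above).
 */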
1112*10105Sadam.leventhal@sun.com */ 1113*10105Sadam.leventhal@sun.com if (c == tgts[tt]) { 1114*10105Sadam.leventhal@sun.com tt++; 1115*10105Sadam.leventhal@sun.com continue; 1116*10105Sadam.leventhal@sun.com } 1117*10105Sadam.leventhal@sun.com 1118*10105Sadam.leventhal@sun.com code |= 1 << c; 1119*10105Sadam.leventhal@sun.com 1120*10105Sadam.leventhal@sun.com parity_map[i] = c; 1121*10105Sadam.leventhal@sun.com i++; 1122*10105Sadam.leventhal@sun.com } 1123*10105Sadam.leventhal@sun.com 1124*10105Sadam.leventhal@sun.com ASSERT(code != 0); 1125*10105Sadam.leventhal@sun.com ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1126*10105Sadam.leventhal@sun.com 1127*10105Sadam.leventhal@sun.com psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1128*10105Sadam.leventhal@sun.com nmissing_rows * n + sizeof (used[0]) * n; 1129*10105Sadam.leventhal@sun.com p = kmem_alloc(psize, KM_SLEEP); 1130*10105Sadam.leventhal@sun.com 1131*10105Sadam.leventhal@sun.com for (pp = p, i = 0; i < nmissing_rows; i++) { 1132*10105Sadam.leventhal@sun.com rows[i] = pp; 1133*10105Sadam.leventhal@sun.com pp += n; 1134*10105Sadam.leventhal@sun.com invrows[i] = pp; 1135*10105Sadam.leventhal@sun.com pp += n; 1136*10105Sadam.leventhal@sun.com } 1137*10105Sadam.leventhal@sun.com used = pp; 1138*10105Sadam.leventhal@sun.com 1139*10105Sadam.leventhal@sun.com for (i = 0; i < nmissing_rows; i++) { 1140*10105Sadam.leventhal@sun.com used[i] = parity_map[i]; 1141*10105Sadam.leventhal@sun.com } 1142*10105Sadam.leventhal@sun.com 1143*10105Sadam.leventhal@sun.com for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1144*10105Sadam.leventhal@sun.com if (tt < nmissing_rows && 1145*10105Sadam.leventhal@sun.com c == missing_rows[tt] + rm->rm_firstdatacol) { 1146*10105Sadam.leventhal@sun.com tt++; 1147*10105Sadam.leventhal@sun.com continue; 1148*10105Sadam.leventhal@sun.com } 1149*10105Sadam.leventhal@sun.com 1150*10105Sadam.leventhal@sun.com ASSERT3S(i, <, n); 1151*10105Sadam.leventhal@sun.com used[i] = c; 1152*10105Sadam.leventhal@sun.com i++; 1153*10105Sadam.leventhal@sun.com } 1154*10105Sadam.leventhal@sun.com 1155*10105Sadam.leventhal@sun.com /* 1156*10105Sadam.leventhal@sun.com * Initialize the interesting rows of the matrix. 1157*10105Sadam.leventhal@sun.com */ 1158*10105Sadam.leventhal@sun.com vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1159*10105Sadam.leventhal@sun.com 1160*10105Sadam.leventhal@sun.com /* 1161*10105Sadam.leventhal@sun.com * Invert the matrix. 1162*10105Sadam.leventhal@sun.com */ 1163*10105Sadam.leventhal@sun.com vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1164*10105Sadam.leventhal@sun.com invrows, used); 1165*10105Sadam.leventhal@sun.com 1166*10105Sadam.leventhal@sun.com /* 1167*10105Sadam.leventhal@sun.com * Reconstruct the missing data using the generated matrix. 
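 * Each missing data column is recomputed as the GF(2^8) dot product of
 * its row of the inverted matrix with the surviving columns listed in
 * 'used' (the selected parity columns followed by the intact data
 * columns).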
1168*10105Sadam.leventhal@sun.com */ 1169*10105Sadam.leventhal@sun.com vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1170*10105Sadam.leventhal@sun.com invrows, used); 1171*10105Sadam.leventhal@sun.com 1172*10105Sadam.leventhal@sun.com kmem_free(p, psize); 1173*10105Sadam.leventhal@sun.com 1174*10105Sadam.leventhal@sun.com return (code); 1175*10105Sadam.leventhal@sun.com } 1176*10105Sadam.leventhal@sun.com 1177*10105Sadam.leventhal@sun.com static int 1178*10105Sadam.leventhal@sun.com vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1179*10105Sadam.leventhal@sun.com { 1180*10105Sadam.leventhal@sun.com int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1181*10105Sadam.leventhal@sun.com int ntgts; 1182*10105Sadam.leventhal@sun.com int i, c; 1183*10105Sadam.leventhal@sun.com int code; 1184*10105Sadam.leventhal@sun.com int nbadparity, nbaddata; 1185*10105Sadam.leventhal@sun.com int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1186*10105Sadam.leventhal@sun.com 1187*10105Sadam.leventhal@sun.com /* 1188*10105Sadam.leventhal@sun.com * The tgts list must already be sorted. 1189*10105Sadam.leventhal@sun.com */ 1190*10105Sadam.leventhal@sun.com for (i = 1; i < nt; i++) { 1191*10105Sadam.leventhal@sun.com ASSERT(t[i] > t[i - 1]); 1192*10105Sadam.leventhal@sun.com } 1193*10105Sadam.leventhal@sun.com 1194*10105Sadam.leventhal@sun.com nbadparity = rm->rm_firstdatacol; 1195*10105Sadam.leventhal@sun.com nbaddata = rm->rm_cols - nbadparity; 1196*10105Sadam.leventhal@sun.com ntgts = 0; 1197*10105Sadam.leventhal@sun.com for (i = 0, c = 0; c < rm->rm_cols; c++) { 1198*10105Sadam.leventhal@sun.com if (c < rm->rm_firstdatacol) 1199*10105Sadam.leventhal@sun.com parity_valid[c] = B_FALSE; 1200*10105Sadam.leventhal@sun.com 1201*10105Sadam.leventhal@sun.com if (i < nt && c == t[i]) { 1202*10105Sadam.leventhal@sun.com tgts[ntgts++] = c; 1203*10105Sadam.leventhal@sun.com i++; 1204*10105Sadam.leventhal@sun.com } else if (rm->rm_col[c].rc_error != 0) { 1205*10105Sadam.leventhal@sun.com tgts[ntgts++] = c; 1206*10105Sadam.leventhal@sun.com } else if (c >= rm->rm_firstdatacol) { 1207*10105Sadam.leventhal@sun.com nbaddata--; 1208*10105Sadam.leventhal@sun.com } else { 1209*10105Sadam.leventhal@sun.com parity_valid[c] = B_TRUE; 1210*10105Sadam.leventhal@sun.com nbadparity--; 1211*10105Sadam.leventhal@sun.com } 1212*10105Sadam.leventhal@sun.com } 1213*10105Sadam.leventhal@sun.com 1214*10105Sadam.leventhal@sun.com ASSERT(ntgts >= nt); 1215*10105Sadam.leventhal@sun.com ASSERT(nbaddata >= 0); 1216*10105Sadam.leventhal@sun.com ASSERT(nbaddata + nbadparity == ntgts); 1217*10105Sadam.leventhal@sun.com 1218*10105Sadam.leventhal@sun.com dt = &tgts[nbadparity]; 1219*10105Sadam.leventhal@sun.com 1220*10105Sadam.leventhal@sun.com /* 1221*10105Sadam.leventhal@sun.com * See if we can use any of our optimized reconstruction routines. 
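 * Failures that can be repaired using only the P and/or Q parity are
 * handled by the dedicated vdev_raidz_reconstruct_p(), _q(), and _pq()
 * routines; anything else, including any case that needs the R parity,
 * falls through to the general matrix-inversion path. Setting
 * vdev_raidz_default_to_general forces the general path even for the
 * simple cases.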
1222*10105Sadam.leventhal@sun.com */ 1223*10105Sadam.leventhal@sun.com if (!vdev_raidz_default_to_general) { 1224*10105Sadam.leventhal@sun.com switch (nbaddata) { 1225*10105Sadam.leventhal@sun.com case 1: 1226*10105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_P]) 1227*10105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_p(rm, dt, 1)); 1228*10105Sadam.leventhal@sun.com 1229*10105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 1); 1230*10105Sadam.leventhal@sun.com 1231*10105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_Q]) 1232*10105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_q(rm, dt, 1)); 1233*10105Sadam.leventhal@sun.com 1234*10105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 2); 1235*10105Sadam.leventhal@sun.com break; 1236*10105Sadam.leventhal@sun.com 1237*10105Sadam.leventhal@sun.com case 2: 1238*10105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 1); 1239*10105Sadam.leventhal@sun.com 1240*10105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_P] && 1241*10105Sadam.leventhal@sun.com parity_valid[VDEV_RAIDZ_Q]) 1242*10105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 1243*10105Sadam.leventhal@sun.com 1244*10105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 2); 1245*10105Sadam.leventhal@sun.com 1246*10105Sadam.leventhal@sun.com break; 1247*10105Sadam.leventhal@sun.com } 1248*10105Sadam.leventhal@sun.com } 1249*10105Sadam.leventhal@sun.com 1250*10105Sadam.leventhal@sun.com code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1251*10105Sadam.leventhal@sun.com ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1252*10105Sadam.leventhal@sun.com ASSERT(code > 0); 1253*10105Sadam.leventhal@sun.com return (code); 1254*10105Sadam.leventhal@sun.com } 12552082Seschrock 1256789Sahrens static int 1257789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 1258789Sahrens { 1259*10105Sadam.leventhal@sun.com vdev_t *cvd; 12602082Seschrock uint64_t nparity = vd->vdev_nparity; 1261*10105Sadam.leventhal@sun.com int c; 1262789Sahrens int lasterror = 0; 1263789Sahrens int numerrors = 0; 1264789Sahrens 12652082Seschrock ASSERT(nparity > 0); 12662082Seschrock 12672082Seschrock if (nparity > VDEV_RAIDZ_MAXPARITY || 12682082Seschrock vd->vdev_children < nparity + 1) { 1269789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1270789Sahrens return (EINVAL); 1271789Sahrens } 1272789Sahrens 12739846SEric.Taylor@Sun.COM vdev_open_children(vd); 1274789Sahrens 1275*10105Sadam.leventhal@sun.com for (c = 0; c < vd->vdev_children; c++) { 1276*10105Sadam.leventhal@sun.com cvd = vd->vdev_child[c]; 12779846SEric.Taylor@Sun.COM 1278*10105Sadam.leventhal@sun.com if (cvd->vdev_open_error != 0) { 12799846SEric.Taylor@Sun.COM lasterror = cvd->vdev_open_error; 1280789Sahrens numerrors++; 1281789Sahrens continue; 1282789Sahrens } 1283789Sahrens 1284789Sahrens *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 12851732Sbonwick *ashift = MAX(*ashift, cvd->vdev_ashift); 1286789Sahrens } 1287789Sahrens 1288789Sahrens *asize *= vd->vdev_children; 1289789Sahrens 12902082Seschrock if (numerrors > nparity) { 1291789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1292789Sahrens return (lasterror); 1293789Sahrens } 1294789Sahrens 1295789Sahrens return (0); 1296789Sahrens } 1297789Sahrens 1298789Sahrens static void 1299789Sahrens vdev_raidz_close(vdev_t *vd) 1300789Sahrens { 1301*10105Sadam.leventhal@sun.com int c; 1302*10105Sadam.leventhal@sun.com 1303*10105Sadam.leventhal@sun.com for (c = 0; c < vd->vdev_children; c++) 1304789Sahrens 
vdev_close(vd->vdev_child[c]); 1305789Sahrens } 1306789Sahrens 1307789Sahrens static uint64_t 1308789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1309789Sahrens { 1310789Sahrens uint64_t asize; 13111732Sbonwick uint64_t ashift = vd->vdev_top->vdev_ashift; 1312789Sahrens uint64_t cols = vd->vdev_children; 13132082Seschrock uint64_t nparity = vd->vdev_nparity; 1314789Sahrens 13151732Sbonwick asize = ((psize - 1) >> ashift) + 1; 13162082Seschrock asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 13172082Seschrock asize = roundup(asize, nparity + 1) << ashift; 1318789Sahrens 1319789Sahrens return (asize); 1320789Sahrens } 1321789Sahrens 1322789Sahrens static void 1323789Sahrens vdev_raidz_child_done(zio_t *zio) 1324789Sahrens { 1325789Sahrens raidz_col_t *rc = zio->io_private; 1326789Sahrens 1327789Sahrens rc->rc_error = zio->io_error; 1328789Sahrens rc->rc_tried = 1; 1329789Sahrens rc->rc_skipped = 0; 1330789Sahrens } 1331789Sahrens 13325530Sbonwick static int 1333789Sahrens vdev_raidz_io_start(zio_t *zio) 1334789Sahrens { 1335789Sahrens vdev_t *vd = zio->io_vd; 13361732Sbonwick vdev_t *tvd = vd->vdev_top; 1337789Sahrens vdev_t *cvd; 1338789Sahrens blkptr_t *bp = zio->io_bp; 1339789Sahrens raidz_map_t *rm; 1340789Sahrens raidz_col_t *rc; 1341*10105Sadam.leventhal@sun.com int c, i; 1342789Sahrens 13432082Seschrock rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 13442082Seschrock vd->vdev_nparity); 1345789Sahrens 13461775Sbillm ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1347789Sahrens 1348789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 1349*10105Sadam.leventhal@sun.com vdev_raidz_generate_parity(rm); 1350789Sahrens 1351789Sahrens for (c = 0; c < rm->rm_cols; c++) { 1352789Sahrens rc = &rm->rm_col[c]; 13532082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 1354789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1355789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 13567754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0, 1357789Sahrens vdev_raidz_child_done, rc)); 1358789Sahrens } 13595530Sbonwick 1360*10105Sadam.leventhal@sun.com /* 1361*10105Sadam.leventhal@sun.com * Generate optional I/Os for any skipped sectors to improve 1362*10105Sadam.leventhal@sun.com * aggregation contiguity. 1363*10105Sadam.leventhal@sun.com */ 1364*10105Sadam.leventhal@sun.com for (c = rm->rm_bigcols, i = 0; i < rm->rm_skipped; c++, i++) { 1365*10105Sadam.leventhal@sun.com ASSERT(c <= rm->rm_scols); 1366*10105Sadam.leventhal@sun.com if (c == rm->rm_scols) 1367*10105Sadam.leventhal@sun.com c = 0; 1368*10105Sadam.leventhal@sun.com rc = &rm->rm_col[c]; 1369*10105Sadam.leventhal@sun.com cvd = vd->vdev_child[rc->rc_devidx]; 1370*10105Sadam.leventhal@sun.com zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1371*10105Sadam.leventhal@sun.com rc->rc_offset + rc->rc_size, NULL, 1372*10105Sadam.leventhal@sun.com 1 << tvd->vdev_ashift, 1373*10105Sadam.leventhal@sun.com zio->io_type, zio->io_priority, 1374*10105Sadam.leventhal@sun.com ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 1375*10105Sadam.leventhal@sun.com } 1376*10105Sadam.leventhal@sun.com 13777754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 1378789Sahrens } 1379789Sahrens 1380789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 1381789Sahrens 13822082Seschrock /* 13832082Seschrock * Iterate over the columns in reverse order so that we hit the parity 1384*10105Sadam.leventhal@sun.com * last -- any errors along the way will force us to read the parity. 
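 * Parity occupies the low-numbered columns, so walking from the last
 * column down visits every data column before any parity column; by
 * the time we reach the parity we know whether rm_missingdata is
 * nonzero and therefore whether the parity reads are actually needed.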
13852082Seschrock */
1386789Sahrens for (c = rm->rm_cols - 1; c >= 0; c--) {
1387789Sahrens rc = &rm->rm_col[c];
13882082Seschrock cvd = vd->vdev_child[rc->rc_devidx];
13895329Sgw25295 if (!vdev_readable(cvd)) {
13902082Seschrock if (c >= rm->rm_firstdatacol)
13912082Seschrock rm->rm_missingdata++;
13922082Seschrock else
13932082Seschrock rm->rm_missingparity++;
1394789Sahrens rc->rc_error = ENXIO;
1395789Sahrens rc->rc_tried = 1; /* don't even try */
1396789Sahrens rc->rc_skipped = 1;
1397789Sahrens continue;
1398789Sahrens }
13998241SJeff.Bonwick@Sun.COM if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
14002082Seschrock if (c >= rm->rm_firstdatacol)
14012082Seschrock rm->rm_missingdata++;
14022082Seschrock else
14032082Seschrock rm->rm_missingparity++;
1404789Sahrens rc->rc_error = ESTALE;
1405789Sahrens rc->rc_skipped = 1;
1406789Sahrens continue;
1407789Sahrens }
14082082Seschrock if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
14099434SMark.Musante@Sun.COM (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1410789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1411789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size,
14127754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0,
1413789Sahrens vdev_raidz_child_done, rc));
1414789Sahrens }
1415789Sahrens }
1416789Sahrens
14177754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE);
1418789Sahrens }
1419789Sahrens
14201544Seschrock /*
14211544Seschrock * Report a checksum error for a child of a RAID-Z device.
14221544Seschrock */
14231544Seschrock static void
14241544Seschrock raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
14251544Seschrock {
14262082Seschrock vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
14271544Seschrock
14281544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
14291544Seschrock mutex_enter(&vd->vdev_stat_lock);
14301544Seschrock vd->vdev_stat.vs_checksum_errors++;
14311544Seschrock mutex_exit(&vd->vdev_stat_lock);
14321544Seschrock }
14331544Seschrock
14341544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
14351544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
14361544Seschrock zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
14371544Seschrock }
14381544Seschrock
14392082Seschrock /*
14402082Seschrock * Generate the parity from the data columns. If we tried and were able to
14412082Seschrock * read the parity without error, verify that the generated parity matches the
14422082Seschrock * data we read. If it doesn't, we fire off a checksum error. Return the
14432082Seschrock * number of such failures.
14442082Seschrock */ 14452082Seschrock static int 14462082Seschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 14472082Seschrock { 14482082Seschrock void *orig[VDEV_RAIDZ_MAXPARITY]; 14492082Seschrock int c, ret = 0; 14502082Seschrock raidz_col_t *rc; 14512082Seschrock 14522082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 14532082Seschrock rc = &rm->rm_col[c]; 14542082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 14552082Seschrock continue; 14562082Seschrock orig[c] = zio_buf_alloc(rc->rc_size); 14572082Seschrock bcopy(rc->rc_data, orig[c], rc->rc_size); 14582082Seschrock } 14592082Seschrock 1460*10105Sadam.leventhal@sun.com vdev_raidz_generate_parity(rm); 14612082Seschrock 14622082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 14632082Seschrock rc = &rm->rm_col[c]; 14642082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 14652082Seschrock continue; 14662082Seschrock if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 14672082Seschrock raidz_checksum_error(zio, rc); 14682082Seschrock rc->rc_error = ECKSUM; 14692082Seschrock ret++; 14702082Seschrock } 14712082Seschrock zio_buf_free(orig[c], rc->rc_size); 14722082Seschrock } 14732082Seschrock 14742082Seschrock return (ret); 14752082Seschrock } 14762082Seschrock 1477*10105Sadam.leventhal@sun.com /* 1478*10105Sadam.leventhal@sun.com * Keep statistics on all the ways that we used parity to correct data. 1479*10105Sadam.leventhal@sun.com */ 1480*10105Sadam.leventhal@sun.com static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 14811544Seschrock 14825530Sbonwick static int 14837754SJeff.Bonwick@Sun.COM vdev_raidz_worst_error(raidz_map_t *rm) 14847754SJeff.Bonwick@Sun.COM { 14857754SJeff.Bonwick@Sun.COM int error = 0; 14867754SJeff.Bonwick@Sun.COM 14877754SJeff.Bonwick@Sun.COM for (int c = 0; c < rm->rm_cols; c++) 14887754SJeff.Bonwick@Sun.COM error = zio_worst_error(error, rm->rm_col[c].rc_error); 14897754SJeff.Bonwick@Sun.COM 14907754SJeff.Bonwick@Sun.COM return (error); 14917754SJeff.Bonwick@Sun.COM } 14927754SJeff.Bonwick@Sun.COM 1493*10105Sadam.leventhal@sun.com /* 1494*10105Sadam.leventhal@sun.com * Iterate over all combinations of bad data and attempt a reconstruction. 1495*10105Sadam.leventhal@sun.com * Note that the algorithm below is non-optimal because it doesn't take into 1496*10105Sadam.leventhal@sun.com * account how reconstruction is actually performed. For example, with 1497*10105Sadam.leventhal@sun.com * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1498*10105Sadam.leventhal@sun.com * is targeted as invalid as if columns 1 and 4 are targeted since in both 1499*10105Sadam.leventhal@sun.com * cases we'd only use parity information in column 0. 
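 * The outer loop below widens the guess one column at a time: the
 * first pass (n = 1) tries single candidate columns, the next pass
 * (n = 2) tries pairs, and so on, up to however much parity remains
 * after accounting for the errors we already know about. Columns that
 * already reported errors are always included as targets by
 * vdev_raidz_reconstruct() itself, so only the additional, silently
 * damaged candidates are enumerated here.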
1500*10105Sadam.leventhal@sun.com */ 1501*10105Sadam.leventhal@sun.com static int 1502*10105Sadam.leventhal@sun.com vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) 1503*10105Sadam.leventhal@sun.com { 1504*10105Sadam.leventhal@sun.com raidz_map_t *rm = zio->io_vsd; 1505*10105Sadam.leventhal@sun.com raidz_col_t *rc; 1506*10105Sadam.leventhal@sun.com void *orig[VDEV_RAIDZ_MAXPARITY]; 1507*10105Sadam.leventhal@sun.com int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1508*10105Sadam.leventhal@sun.com int *tgts = &tstore[1]; 1509*10105Sadam.leventhal@sun.com int current, next, i, c, n; 1510*10105Sadam.leventhal@sun.com int code, ret = 0; 1511*10105Sadam.leventhal@sun.com 1512*10105Sadam.leventhal@sun.com ASSERT(total_errors < rm->rm_firstdatacol); 1513*10105Sadam.leventhal@sun.com 1514*10105Sadam.leventhal@sun.com /* 1515*10105Sadam.leventhal@sun.com * This simplifies one edge condition. 1516*10105Sadam.leventhal@sun.com */ 1517*10105Sadam.leventhal@sun.com tgts[-1] = -1; 1518*10105Sadam.leventhal@sun.com 1519*10105Sadam.leventhal@sun.com for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1520*10105Sadam.leventhal@sun.com /* 1521*10105Sadam.leventhal@sun.com * Initialize the targets array by finding the first n columns 1522*10105Sadam.leventhal@sun.com * that contain no error. 1523*10105Sadam.leventhal@sun.com * 1524*10105Sadam.leventhal@sun.com * If there were no data errors, we need to ensure that we're 1525*10105Sadam.leventhal@sun.com * always explicitly attempting to reconstruct at least one 1526*10105Sadam.leventhal@sun.com * data column. To do this, we simply push the highest target 1527*10105Sadam.leventhal@sun.com * up into the data columns. 1528*10105Sadam.leventhal@sun.com */ 1529*10105Sadam.leventhal@sun.com for (c = 0, i = 0; i < n; i++) { 1530*10105Sadam.leventhal@sun.com if (i == n - 1 && data_errors == 0 && 1531*10105Sadam.leventhal@sun.com c < rm->rm_firstdatacol) { 1532*10105Sadam.leventhal@sun.com c = rm->rm_firstdatacol; 1533*10105Sadam.leventhal@sun.com } 1534*10105Sadam.leventhal@sun.com 1535*10105Sadam.leventhal@sun.com while (rm->rm_col[c].rc_error != 0) { 1536*10105Sadam.leventhal@sun.com c++; 1537*10105Sadam.leventhal@sun.com ASSERT3S(c, <, rm->rm_cols); 1538*10105Sadam.leventhal@sun.com } 1539*10105Sadam.leventhal@sun.com 1540*10105Sadam.leventhal@sun.com tgts[i] = c++; 1541*10105Sadam.leventhal@sun.com } 1542*10105Sadam.leventhal@sun.com 1543*10105Sadam.leventhal@sun.com /* 1544*10105Sadam.leventhal@sun.com * Setting tgts[n] simplifies the other edge condition. 1545*10105Sadam.leventhal@sun.com */ 1546*10105Sadam.leventhal@sun.com tgts[n] = rm->rm_cols; 1547*10105Sadam.leventhal@sun.com 1548*10105Sadam.leventhal@sun.com /* 1549*10105Sadam.leventhal@sun.com * These buffers were allocated in previous iterations. 
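 * Only the buffer for the newly added target slot, orig[n - 1], needs
 * to be allocated below; the first n - 1 slots are reused from the
 * passes where n was smaller.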
1550*10105Sadam.leventhal@sun.com */
1551*10105Sadam.leventhal@sun.com for (i = 0; i < n - 1; i++) {
1552*10105Sadam.leventhal@sun.com ASSERT(orig[i] != NULL);
1553*10105Sadam.leventhal@sun.com }
1554*10105Sadam.leventhal@sun.com
1555*10105Sadam.leventhal@sun.com orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1556*10105Sadam.leventhal@sun.com
1557*10105Sadam.leventhal@sun.com current = 0;
1558*10105Sadam.leventhal@sun.com next = tgts[current];
1559*10105Sadam.leventhal@sun.com
1560*10105Sadam.leventhal@sun.com while (current != n) {
1561*10105Sadam.leventhal@sun.com tgts[current] = next;
1562*10105Sadam.leventhal@sun.com current = 0;
1563*10105Sadam.leventhal@sun.com
1564*10105Sadam.leventhal@sun.com /*
1565*10105Sadam.leventhal@sun.com * Save off the original data that we're going to
1566*10105Sadam.leventhal@sun.com * attempt to reconstruct.
1567*10105Sadam.leventhal@sun.com */
1568*10105Sadam.leventhal@sun.com for (i = 0; i < n; i++) {
1569*10105Sadam.leventhal@sun.com ASSERT(orig[i] != NULL);
1570*10105Sadam.leventhal@sun.com c = tgts[i];
1571*10105Sadam.leventhal@sun.com ASSERT3S(c, >=, 0);
1572*10105Sadam.leventhal@sun.com ASSERT3S(c, <, rm->rm_cols);
1573*10105Sadam.leventhal@sun.com rc = &rm->rm_col[c];
1574*10105Sadam.leventhal@sun.com bcopy(rc->rc_data, orig[i], rc->rc_size);
1575*10105Sadam.leventhal@sun.com }
1576*10105Sadam.leventhal@sun.com
1577*10105Sadam.leventhal@sun.com /*
1578*10105Sadam.leventhal@sun.com * Attempt a reconstruction and exit the outer loop on
1579*10105Sadam.leventhal@sun.com * success.
1580*10105Sadam.leventhal@sun.com */
1581*10105Sadam.leventhal@sun.com code = vdev_raidz_reconstruct(rm, tgts, n);
1582*10105Sadam.leventhal@sun.com if (zio_checksum_error(zio) == 0) {
1583*10105Sadam.leventhal@sun.com atomic_inc_64(&raidz_corrected[code]);
1584*10105Sadam.leventhal@sun.com
1585*10105Sadam.leventhal@sun.com for (i = 0; i < n; i++) {
1586*10105Sadam.leventhal@sun.com c = tgts[i];
1587*10105Sadam.leventhal@sun.com rc = &rm->rm_col[c];
1588*10105Sadam.leventhal@sun.com ASSERT(rc->rc_error == 0);
1589*10105Sadam.leventhal@sun.com if (rc->rc_tried)
1590*10105Sadam.leventhal@sun.com raidz_checksum_error(zio, rc);
1591*10105Sadam.leventhal@sun.com rc->rc_error = ECKSUM;
1592*10105Sadam.leventhal@sun.com }
1593*10105Sadam.leventhal@sun.com
1594*10105Sadam.leventhal@sun.com ret = code;
1595*10105Sadam.leventhal@sun.com goto done;
1596*10105Sadam.leventhal@sun.com }
1597*10105Sadam.leventhal@sun.com
1598*10105Sadam.leventhal@sun.com /*
1599*10105Sadam.leventhal@sun.com * Restore the original data.
1600*10105Sadam.leventhal@sun.com */
1601*10105Sadam.leventhal@sun.com for (i = 0; i < n; i++) {
1602*10105Sadam.leventhal@sun.com c = tgts[i];
1603*10105Sadam.leventhal@sun.com rc = &rm->rm_col[c];
1604*10105Sadam.leventhal@sun.com bcopy(orig[i], rc->rc_data, rc->rc_size);
1605*10105Sadam.leventhal@sun.com }
1606*10105Sadam.leventhal@sun.com
1607*10105Sadam.leventhal@sun.com do {
1608*10105Sadam.leventhal@sun.com /*
1609*10105Sadam.leventhal@sun.com * Find the next valid column after the current
1610*10105Sadam.leventhal@sun.com * position.
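 * The targets tuple advances like an odometer: slot 'current' is
 * bumped to the next error-free column, and when that would collide
 * with the value in the slot above it, this slot is instead reset to
 * just past the slot below and the carry propagates upward.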
1611*10105Sadam.leventhal@sun.com */ 1612*10105Sadam.leventhal@sun.com for (next = tgts[current] + 1; 1613*10105Sadam.leventhal@sun.com next < rm->rm_cols && 1614*10105Sadam.leventhal@sun.com rm->rm_col[next].rc_error != 0; next++) 1615*10105Sadam.leventhal@sun.com continue; 1616*10105Sadam.leventhal@sun.com 1617*10105Sadam.leventhal@sun.com ASSERT(next <= tgts[current + 1]); 1618*10105Sadam.leventhal@sun.com 1619*10105Sadam.leventhal@sun.com /* 1620*10105Sadam.leventhal@sun.com * If that spot is available, we're done here. 1621*10105Sadam.leventhal@sun.com */ 1622*10105Sadam.leventhal@sun.com if (next != tgts[current + 1]) 1623*10105Sadam.leventhal@sun.com break; 1624*10105Sadam.leventhal@sun.com 1625*10105Sadam.leventhal@sun.com /* 1626*10105Sadam.leventhal@sun.com * Otherwise, find the next valid column after 1627*10105Sadam.leventhal@sun.com * the previous position. 1628*10105Sadam.leventhal@sun.com */ 1629*10105Sadam.leventhal@sun.com for (c = tgts[current - 1] + 1; 1630*10105Sadam.leventhal@sun.com rm->rm_col[c].rc_error != 0; c++) 1631*10105Sadam.leventhal@sun.com continue; 1632*10105Sadam.leventhal@sun.com 1633*10105Sadam.leventhal@sun.com tgts[current] = c; 1634*10105Sadam.leventhal@sun.com current++; 1635*10105Sadam.leventhal@sun.com 1636*10105Sadam.leventhal@sun.com } while (current != n); 1637*10105Sadam.leventhal@sun.com } 1638*10105Sadam.leventhal@sun.com } 1639*10105Sadam.leventhal@sun.com n--; 1640*10105Sadam.leventhal@sun.com done: 1641*10105Sadam.leventhal@sun.com for (i = 0; i < n; i++) { 1642*10105Sadam.leventhal@sun.com zio_buf_free(orig[i], rm->rm_col[0].rc_size); 1643*10105Sadam.leventhal@sun.com } 1644*10105Sadam.leventhal@sun.com 1645*10105Sadam.leventhal@sun.com return (ret); 1646*10105Sadam.leventhal@sun.com } 1647*10105Sadam.leventhal@sun.com 16487754SJeff.Bonwick@Sun.COM static void 1649789Sahrens vdev_raidz_io_done(zio_t *zio) 1650789Sahrens { 1651789Sahrens vdev_t *vd = zio->io_vd; 1652789Sahrens vdev_t *cvd; 1653789Sahrens raidz_map_t *rm = zio->io_vsd; 1654*10105Sadam.leventhal@sun.com raidz_col_t *rc; 1655789Sahrens int unexpected_errors = 0; 16562082Seschrock int parity_errors = 0; 16573456Sahl int parity_untried = 0; 16582082Seschrock int data_errors = 0; 16597754SJeff.Bonwick@Sun.COM int total_errors = 0; 1660*10105Sadam.leventhal@sun.com int n, c; 1661*10105Sadam.leventhal@sun.com int tgts[VDEV_RAIDZ_MAXPARITY]; 1662*10105Sadam.leventhal@sun.com int code; 1663789Sahrens 16641775Sbillm ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 1665789Sahrens 16662082Seschrock ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 16672082Seschrock ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 16682082Seschrock 1669789Sahrens for (c = 0; c < rm->rm_cols; c++) { 1670789Sahrens rc = &rm->rm_col[c]; 1671789Sahrens 1672789Sahrens if (rc->rc_error) { 16737754SJeff.Bonwick@Sun.COM ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 16742082Seschrock 16752082Seschrock if (c < rm->rm_firstdatacol) 16762082Seschrock parity_errors++; 16772082Seschrock else 16782082Seschrock data_errors++; 16792082Seschrock 1680789Sahrens if (!rc->rc_skipped) 1681789Sahrens unexpected_errors++; 16822082Seschrock 16837754SJeff.Bonwick@Sun.COM total_errors++; 16843456Sahl } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 16853456Sahl parity_untried++; 1686789Sahrens } 1687789Sahrens } 1688789Sahrens 1689789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 1690789Sahrens /* 16917754SJeff.Bonwick@Sun.COM * XXX -- for now, treat partial writes as a 
success. 16927754SJeff.Bonwick@Sun.COM * (If we couldn't write enough columns to reconstruct 16937754SJeff.Bonwick@Sun.COM * the data, the I/O failed. Otherwise, good enough.) 16947754SJeff.Bonwick@Sun.COM * 16957754SJeff.Bonwick@Sun.COM * Now that we support write reallocation, it would be better 16967754SJeff.Bonwick@Sun.COM * to treat partial failure as real failure unless there are 16977754SJeff.Bonwick@Sun.COM * no non-degraded top-level vdevs left, and not update DTLs 16987754SJeff.Bonwick@Sun.COM * if we intend to reallocate. 1699789Sahrens */ 1700789Sahrens /* XXPOLICY */ 17017754SJeff.Bonwick@Sun.COM if (total_errors > rm->rm_firstdatacol) 17027754SJeff.Bonwick@Sun.COM zio->io_error = vdev_raidz_worst_error(rm); 1703789Sahrens 17047754SJeff.Bonwick@Sun.COM return; 1705789Sahrens } 1706789Sahrens 1707789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 17082082Seschrock /* 17092082Seschrock * There are three potential phases for a read: 17102082Seschrock * 1. produce valid data from the columns read 17112082Seschrock * 2. read all disks and try again 17122082Seschrock * 3. perform combinatorial reconstruction 17132082Seschrock * 17142082Seschrock * Each phase is progressively both more expensive and less likely to 17152082Seschrock * occur. If we encounter more errors than we can repair or all phases 17162082Seschrock * fail, we have no choice but to return an error. 17172082Seschrock */ 1718789Sahrens 1719789Sahrens /* 17202082Seschrock * If the number of errors we saw was correctable -- less than or equal 17213456Sahl * to the number of parity disks read -- attempt to produce data that 17223456Sahl * has a valid checksum. Naturally, this case applies in the absence of 17233456Sahl * any errors. 1724789Sahrens */ 17257754SJeff.Bonwick@Sun.COM if (total_errors <= rm->rm_firstdatacol - parity_untried) { 1726*10105Sadam.leventhal@sun.com if (data_errors == 0) { 17272082Seschrock if (zio_checksum_error(zio) == 0) { 17284034Sahl /* 17294034Sahl * If we read parity information (unnecessarily 17304034Sahl * as it happens since no reconstruction was 17314034Sahl * needed) regenerate and verify the parity. 17324034Sahl * We also regenerate parity when resilvering 17334034Sahl * so we can write it out to the failed device 17344034Sahl * later. 17354034Sahl */ 17363456Sahl if (parity_errors + parity_untried < 17374034Sahl rm->rm_firstdatacol || 17384034Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) { 17393456Sahl n = raidz_parity_verify(zio, rm); 17403456Sahl unexpected_errors += n; 17413456Sahl ASSERT(parity_errors + n <= 17423456Sahl rm->rm_firstdatacol); 17433456Sahl } 17442082Seschrock goto done; 17452082Seschrock } 1746*10105Sadam.leventhal@sun.com } else { 17473456Sahl /* 17483456Sahl * We either attempt to read all the parity columns or 17493456Sahl * none of them. If we didn't try to read parity, we 17503456Sahl * wouldn't be here in the correctable case. There must 17513456Sahl * also have been fewer parity errors than parity 17523456Sahl * columns or, again, we wouldn't be in this code path. 17533456Sahl */ 17543456Sahl ASSERT(parity_untried == 0); 17552082Seschrock ASSERT(parity_errors < rm->rm_firstdatacol); 17562082Seschrock 17572082Seschrock /* 1758*10105Sadam.leventhal@sun.com * Identify the data columns that reported an error. 
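 * These become the reconstruction targets passed to
 * vdev_raidz_reconstruct() below; any parity columns that also failed
 * are added as targets automatically by that routine.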
17592082Seschrock */ 1760*10105Sadam.leventhal@sun.com n = 0; 17612082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 17622082Seschrock rc = &rm->rm_col[c]; 1763*10105Sadam.leventhal@sun.com if (rc->rc_error != 0) { 1764*10105Sadam.leventhal@sun.com ASSERT(n < VDEV_RAIDZ_MAXPARITY); 1765*10105Sadam.leventhal@sun.com tgts[n++] = c; 1766*10105Sadam.leventhal@sun.com } 17672082Seschrock } 17682082Seschrock 1769*10105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol >= n); 1770*10105Sadam.leventhal@sun.com 1771*10105Sadam.leventhal@sun.com code = vdev_raidz_reconstruct(rm, tgts, n); 17722082Seschrock 17732082Seschrock if (zio_checksum_error(zio) == 0) { 1774*10105Sadam.leventhal@sun.com atomic_inc_64(&raidz_corrected[code]); 1775789Sahrens 17762082Seschrock /* 1777*10105Sadam.leventhal@sun.com * If we read more parity disks than were used 1778*10105Sadam.leventhal@sun.com * for reconstruction, confirm that the other 1779*10105Sadam.leventhal@sun.com * parity disks produced correct data. This 1780*10105Sadam.leventhal@sun.com * routine is suboptimal in that it regenerates 1781*10105Sadam.leventhal@sun.com * the parity that we already used in addition 1782*10105Sadam.leventhal@sun.com * to the parity that we're attempting to 1783*10105Sadam.leventhal@sun.com * verify, but this should be a relatively 1784*10105Sadam.leventhal@sun.com * uncommon case, and can be optimized if it 1785*10105Sadam.leventhal@sun.com * becomes a problem. Note that we regenerate 1786*10105Sadam.leventhal@sun.com * parity when resilvering so we can write it 1787*10105Sadam.leventhal@sun.com * out to failed devices later. 17882082Seschrock */ 1789*10105Sadam.leventhal@sun.com if (parity_errors < rm->rm_firstdatacol - n || 17904034Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) { 17912082Seschrock n = raidz_parity_verify(zio, rm); 17922082Seschrock unexpected_errors += n; 17932082Seschrock ASSERT(parity_errors + n <= 17942082Seschrock rm->rm_firstdatacol); 17952082Seschrock } 17962082Seschrock 17972082Seschrock goto done; 17982082Seschrock } 1799789Sahrens } 1800789Sahrens } 1801789Sahrens 1802789Sahrens /* 18032082Seschrock * This isn't a typical situation -- either we got a read error or 18042082Seschrock * a child silently returned bad data. Read every block so we can 18052082Seschrock * try again with as much data and parity as we can track down. If 18062082Seschrock * we've already been through once before, all children will be marked 18072082Seschrock * as tried so we'll proceed to combinatorial reconstruction. 
1808789Sahrens */ 1809789Sahrens unexpected_errors = 1; 18102082Seschrock rm->rm_missingdata = 0; 18112082Seschrock rm->rm_missingparity = 0; 1812789Sahrens 18132082Seschrock for (c = 0; c < rm->rm_cols; c++) { 18142082Seschrock if (rm->rm_col[c].rc_tried) 18152082Seschrock continue; 1816789Sahrens 1817789Sahrens zio_vdev_io_redone(zio); 18182082Seschrock do { 1819789Sahrens rc = &rm->rm_col[c]; 1820789Sahrens if (rc->rc_tried) 1821789Sahrens continue; 1822789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, 18232082Seschrock vd->vdev_child[rc->rc_devidx], 1824789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 18257754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0, 1826789Sahrens vdev_raidz_child_done, rc)); 18272082Seschrock } while (++c < rm->rm_cols); 18285530Sbonwick 18297754SJeff.Bonwick@Sun.COM return; 1830789Sahrens } 1831789Sahrens 1832789Sahrens /* 18332082Seschrock * At this point we've attempted to reconstruct the data given the 18342082Seschrock * errors we detected, and we've attempted to read all columns. There 18352082Seschrock * must, therefore, be one or more additional problems -- silent errors 18362082Seschrock * resulting in invalid data rather than explicit I/O errors resulting 1837*10105Sadam.leventhal@sun.com * in absent data. We check if there is enough additional data to 1838*10105Sadam.leventhal@sun.com * possibly reconstruct the data and then perform combinatorial 1839*10105Sadam.leventhal@sun.com * reconstruction over all possible combinations. If that fails, 1840*10105Sadam.leventhal@sun.com * we're cooked. 1841789Sahrens */ 18427754SJeff.Bonwick@Sun.COM if (total_errors >= rm->rm_firstdatacol) { 18437754SJeff.Bonwick@Sun.COM zio->io_error = vdev_raidz_worst_error(rm); 18447754SJeff.Bonwick@Sun.COM /* 18457754SJeff.Bonwick@Sun.COM * If there were exactly as many device errors as parity 18467754SJeff.Bonwick@Sun.COM * columns, yet we couldn't reconstruct the data, then at 18477754SJeff.Bonwick@Sun.COM * least one device must have returned bad data silently. 18487754SJeff.Bonwick@Sun.COM */ 18497754SJeff.Bonwick@Sun.COM if (total_errors == rm->rm_firstdatacol) 18507754SJeff.Bonwick@Sun.COM zio->io_error = zio_worst_error(zio->io_error, ECKSUM); 18512082Seschrock 1852*10105Sadam.leventhal@sun.com } else if ((code = vdev_raidz_combrec(zio, total_errors, 1853*10105Sadam.leventhal@sun.com data_errors)) != 0) { 18542082Seschrock /* 1855*10105Sadam.leventhal@sun.com * If we didn't use all the available parity for the 1856*10105Sadam.leventhal@sun.com * combinatorial reconstruction, verify that the remaining 1857*10105Sadam.leventhal@sun.com * parity is correct. 18582082Seschrock */ 1859*10105Sadam.leventhal@sun.com if (code != (1 << rm->rm_firstdatacol) - 1) 1860*10105Sadam.leventhal@sun.com (void) raidz_parity_verify(zio, rm); 1861*10105Sadam.leventhal@sun.com } else { 1862*10105Sadam.leventhal@sun.com /* 1863*10105Sadam.leventhal@sun.com * All combinations failed to checksum. Generate checksum 1864*10105Sadam.leventhal@sun.com * ereports for all children. 
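 * Since we cannot tell which child or children returned bad data,
 * every column is implicated.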
1865*10105Sadam.leventhal@sun.com */ 1866*10105Sadam.leventhal@sun.com zio->io_error = ECKSUM; 18672082Seschrock 1868*10105Sadam.leventhal@sun.com if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1869*10105Sadam.leventhal@sun.com for (c = 0; c < rm->rm_cols; c++) { 1870*10105Sadam.leventhal@sun.com rc = &rm->rm_col[c]; 1871*10105Sadam.leventhal@sun.com zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 1872*10105Sadam.leventhal@sun.com zio->io_spa, vd->vdev_child[rc->rc_devidx], 1873*10105Sadam.leventhal@sun.com zio, rc->rc_offset, rc->rc_size); 18742082Seschrock } 18751544Seschrock } 18761544Seschrock } 1877789Sahrens 1878789Sahrens done: 1879789Sahrens zio_checksum_verified(zio); 1880789Sahrens 18818241SJeff.Bonwick@Sun.COM if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 1882789Sahrens (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 1883789Sahrens /* 1884789Sahrens * Use the good data we have in hand to repair damaged children. 1885789Sahrens */ 1886789Sahrens for (c = 0; c < rm->rm_cols; c++) { 1887789Sahrens rc = &rm->rm_col[c]; 18882082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 1889789Sahrens 18901732Sbonwick if (rc->rc_error == 0) 18911732Sbonwick continue; 18921732Sbonwick 18937754SJeff.Bonwick@Sun.COM zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 18941732Sbonwick rc->rc_offset, rc->rc_data, rc->rc_size, 18951732Sbonwick ZIO_TYPE_WRITE, zio->io_priority, 18968241SJeff.Bonwick@Sun.COM ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 18978241SJeff.Bonwick@Sun.COM ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 18981732Sbonwick } 1899789Sahrens } 1900789Sahrens } 1901789Sahrens 1902789Sahrens static void 1903789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 1904789Sahrens { 19052082Seschrock if (faulted > vd->vdev_nparity) 19061544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 19071544Seschrock VDEV_AUX_NO_REPLICAS); 1908789Sahrens else if (degraded + faulted != 0) 19091544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 1910789Sahrens else 19111544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 1912789Sahrens } 1913789Sahrens 1914789Sahrens vdev_ops_t vdev_raidz_ops = { 1915789Sahrens vdev_raidz_open, 1916789Sahrens vdev_raidz_close, 1917789Sahrens vdev_raidz_asize, 1918789Sahrens vdev_raidz_io_start, 1919789Sahrens vdev_raidz_io_done, 1920789Sahrens vdev_raidz_state_change, 1921789Sahrens VDEV_TYPE_RAIDZ, /* name of this vdev type */ 1922789Sahrens B_FALSE /* not a leaf vdev */ 1923789Sahrens }; 1924