1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 212082Seschrock 22789Sahrens /* 239434SMark.Musante@Sun.COM * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24789Sahrens * Use is subject to license terms. 25789Sahrens */ 26789Sahrens 27789Sahrens #include <sys/zfs_context.h> 28789Sahrens #include <sys/spa.h> 29789Sahrens #include <sys/vdev_impl.h> 30789Sahrens #include <sys/zio.h> 31789Sahrens #include <sys/zio_checksum.h> 32789Sahrens #include <sys/fs/zfs.h> 331544Seschrock #include <sys/fm/fs/zfs.h> 34789Sahrens 35789Sahrens /* 36789Sahrens * Virtual device vector for RAID-Z. 372082Seschrock * 3810105Sadam.leventhal@sun.com * This vdev supports single, double, and triple parity. For single parity, 3910105Sadam.leventhal@sun.com * we use a simple XOR of all the data columns. For double or triple parity, 4010105Sadam.leventhal@sun.com * we use a special case of Reed-Solomon coding. This extends the 4110105Sadam.leventhal@sun.com * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 4210105Sadam.leventhal@sun.com * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 4310105Sadam.leventhal@sun.com * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 4410105Sadam.leventhal@sun.com * former is also based. The latter is designed to provide higher performance 4510105Sadam.leventhal@sun.com * for writes. 4610105Sadam.leventhal@sun.com * 4710105Sadam.leventhal@sun.com * Note that the Plank paper claimed to support arbitrary N+M, but was then 4810105Sadam.leventhal@sun.com * amended six years later identifying a critical flaw that invalidates its 4910105Sadam.leventhal@sun.com * claims. Nevertheless, the technique can be adapted to work for up to 5010105Sadam.leventhal@sun.com * triple parity. For additional parity, the amendment "Note: Correction to 5110105Sadam.leventhal@sun.com * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 5210105Sadam.leventhal@sun.com * is viable, but the additional complexity means that write performance will 5310105Sadam.leventhal@sun.com * suffer. 5410105Sadam.leventhal@sun.com * 5510105Sadam.leventhal@sun.com * All of the methods above operate on a Galois field, defined over the 5610105Sadam.leventhal@sun.com * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 5710105Sadam.leventhal@sun.com * can be expressed with a single byte. Briefly, the operations on the 5810105Sadam.leventhal@sun.com * field are defined as follows: 592082Seschrock * 602082Seschrock * o addition (+) is represented by a bitwise XOR 612082Seschrock * o subtraction (-) is therefore identical to addition: A + B = A - B 622082Seschrock * o multiplication of A by 2 is defined by the following bitwise expression: 632082Seschrock * (A * 2)_7 = A_6 642082Seschrock * (A * 2)_6 = A_5 652082Seschrock * (A * 2)_5 = A_4 662082Seschrock * (A * 2)_4 = A_3 + A_7 672082Seschrock * (A * 2)_3 = A_2 + A_7 682082Seschrock * (A * 2)_2 = A_1 + A_7 692082Seschrock * (A * 2)_1 = A_0 702082Seschrock * (A * 2)_0 = A_7 712082Seschrock * 722082Seschrock * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 7310105Sadam.leventhal@sun.com * As an aside, this multiplication is derived from the error correcting 7410105Sadam.leventhal@sun.com * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 752082Seschrock * 762082Seschrock * Observe that any number in the field (except for 0) can be expressed as a 772082Seschrock * power of 2 -- a generator for the field. We store a table of the powers of 782082Seschrock * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 792082Seschrock * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 8010105Sadam.leventhal@sun.com * than field addition). The inverse of a field element A (A^-1) is therefore 8110105Sadam.leventhal@sun.com * A ^ (255 - 1) = A^254. 822082Seschrock * 8310105Sadam.leventhal@sun.com * The up-to-three parity columns, P, Q, R over several data columns, 8410105Sadam.leventhal@sun.com * D_0, ... D_n-1, can be expressed by field operations: 852082Seschrock * 862082Seschrock * P = D_0 + D_1 + ... + D_n-2 + D_n-1 872082Seschrock * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 882082Seschrock * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 8910105Sadam.leventhal@sun.com * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 9010105Sadam.leventhal@sun.com * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 912082Seschrock * 9210105Sadam.leventhal@sun.com * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival 9310105Sadam.leventhal@sun.com * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 9410105Sadam.leventhal@sun.com * independent coefficients. (There are no additional coefficients that have 9510105Sadam.leventhal@sun.com * this property which is why the uncorrected Plank method breaks down.) 9610105Sadam.leventhal@sun.com * 9710105Sadam.leventhal@sun.com * See the reconstruction code below for how P, Q and R can used individually 9810105Sadam.leventhal@sun.com * or in concert to recover missing data columns. 99789Sahrens */ 100789Sahrens 101789Sahrens typedef struct raidz_col { 1022082Seschrock uint64_t rc_devidx; /* child device index for I/O */ 1032082Seschrock uint64_t rc_offset; /* device offset */ 1042082Seschrock uint64_t rc_size; /* I/O size */ 1052082Seschrock void *rc_data; /* I/O data */ 1062082Seschrock int rc_error; /* I/O error for this device */ 1072082Seschrock uint8_t rc_tried; /* Did we attempt this I/O column? */ 1082082Seschrock uint8_t rc_skipped; /* Did we skip this I/O column? */ 109789Sahrens } raidz_col_t; 110789Sahrens 111789Sahrens typedef struct raidz_map { 11210105Sadam.leventhal@sun.com uint64_t rm_cols; /* Regular column count */ 11310105Sadam.leventhal@sun.com uint64_t rm_scols; /* Count including skipped columns */ 1142082Seschrock uint64_t rm_bigcols; /* Number of oversized columns */ 1152082Seschrock uint64_t rm_asize; /* Actual total I/O size */ 1162082Seschrock uint64_t rm_missingdata; /* Count of missing data devices */ 1172082Seschrock uint64_t rm_missingparity; /* Count of missing parity devices */ 1182082Seschrock uint64_t rm_firstdatacol; /* First data column/parity count */ 119*10450Sadam.leventhal@sun.com uint64_t rm_nskip; /* Skipped sectors for padding */ 120*10450Sadam.leventhal@sun.com uint64_t rm_skipstart; /* Column index of padding start */ 1212082Seschrock raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 122789Sahrens } raidz_map_t; 123789Sahrens 1242082Seschrock #define VDEV_RAIDZ_P 0 1252082Seschrock #define VDEV_RAIDZ_Q 1 12610105Sadam.leventhal@sun.com #define VDEV_RAIDZ_R 2 12710105Sadam.leventhal@sun.com #define VDEV_RAIDZ_MAXPARITY 3 1282082Seschrock 12910105Sadam.leventhal@sun.com #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 13010105Sadam.leventhal@sun.com #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 1312082Seschrock 13210105Sadam.leventhal@sun.com /* 13310105Sadam.leventhal@sun.com * We provide a mechanism to perform the field multiplication operation on a 13410105Sadam.leventhal@sun.com * 64-bit value all at once rather than a byte at a time. This works by 13510105Sadam.leventhal@sun.com * creating a mask from the top bit in each byte and using that to 13610105Sadam.leventhal@sun.com * conditionally apply the XOR of 0x1d. 13710105Sadam.leventhal@sun.com */ 13810105Sadam.leventhal@sun.com #define VDEV_RAIDZ_64MUL_2(x, mask) \ 13910105Sadam.leventhal@sun.com { \ 14010105Sadam.leventhal@sun.com (mask) = (x) & 0x8080808080808080ULL; \ 14110105Sadam.leventhal@sun.com (mask) = ((mask) << 1) - ((mask) >> 7); \ 14210105Sadam.leventhal@sun.com (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 14310105Sadam.leventhal@sun.com ((mask) & 0x1d1d1d1d1d1d1d1d); \ 14410105Sadam.leventhal@sun.com } 14510105Sadam.leventhal@sun.com 14610105Sadam.leventhal@sun.com #define VDEV_RAIDZ_64MUL_4(x, mask) \ 14710105Sadam.leventhal@sun.com { \ 14810105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2((x), mask); \ 14910105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2((x), mask); \ 15010105Sadam.leventhal@sun.com } 15110105Sadam.leventhal@sun.com 15210105Sadam.leventhal@sun.com /* 15310105Sadam.leventhal@sun.com * Force reconstruction to use the general purpose method. 15410105Sadam.leventhal@sun.com */ 15510105Sadam.leventhal@sun.com int vdev_raidz_default_to_general; 1562082Seschrock 1572082Seschrock /* 1582082Seschrock * These two tables represent powers and logs of 2 in the Galois field defined 1592082Seschrock * above. These values were computed by repeatedly multiplying by 2 as above. 1602082Seschrock */ 1612082Seschrock static const uint8_t vdev_raidz_pow2[256] = { 1622082Seschrock 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 1632082Seschrock 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 1642082Seschrock 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 1652082Seschrock 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 1662082Seschrock 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 1672082Seschrock 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 1682082Seschrock 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 1692082Seschrock 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 1702082Seschrock 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 1712082Seschrock 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 1722082Seschrock 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 1732082Seschrock 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 1742082Seschrock 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 1752082Seschrock 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 1762082Seschrock 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 1772082Seschrock 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 1782082Seschrock 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 1792082Seschrock 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 1802082Seschrock 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 1812082Seschrock 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 1822082Seschrock 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 1832082Seschrock 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 1842082Seschrock 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 1852082Seschrock 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 1862082Seschrock 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 1872082Seschrock 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 1882082Seschrock 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 1892082Seschrock 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 1902082Seschrock 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 1912082Seschrock 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 1922082Seschrock 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 1932082Seschrock 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 1942082Seschrock }; 1952082Seschrock static const uint8_t vdev_raidz_log2[256] = { 1962082Seschrock 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 1972082Seschrock 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 1982082Seschrock 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 1992082Seschrock 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 2002082Seschrock 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 2012082Seschrock 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 2022082Seschrock 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 2032082Seschrock 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 2042082Seschrock 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 2052082Seschrock 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 2062082Seschrock 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 2072082Seschrock 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 2082082Seschrock 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 2092082Seschrock 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 2102082Seschrock 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 2112082Seschrock 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 2122082Seschrock 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 2132082Seschrock 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 2142082Seschrock 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 2152082Seschrock 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 2162082Seschrock 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 2172082Seschrock 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 2182082Seschrock 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 2192082Seschrock 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 2202082Seschrock 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 2212082Seschrock 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 2222082Seschrock 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 2232082Seschrock 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 2242082Seschrock 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 2252082Seschrock 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 2262082Seschrock 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 2272082Seschrock 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 2282082Seschrock }; 2292082Seschrock 2302082Seschrock /* 2312082Seschrock * Multiply a given number by 2 raised to the given power. 2322082Seschrock */ 2332082Seschrock static uint8_t 2342082Seschrock vdev_raidz_exp2(uint_t a, int exp) 2352082Seschrock { 2362082Seschrock if (a == 0) 2372082Seschrock return (0); 2382082Seschrock 2392082Seschrock ASSERT(exp >= 0); 2402082Seschrock ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 2412082Seschrock 2422082Seschrock exp += vdev_raidz_log2[a]; 2432082Seschrock if (exp > 255) 2442082Seschrock exp -= 255; 2452082Seschrock 2462082Seschrock return (vdev_raidz_pow2[exp]); 2472082Seschrock } 2482082Seschrock 2497754SJeff.Bonwick@Sun.COM static void 2507754SJeff.Bonwick@Sun.COM vdev_raidz_map_free(zio_t *zio) 2517754SJeff.Bonwick@Sun.COM { 2527754SJeff.Bonwick@Sun.COM raidz_map_t *rm = zio->io_vsd; 2537754SJeff.Bonwick@Sun.COM int c; 2547754SJeff.Bonwick@Sun.COM 2557754SJeff.Bonwick@Sun.COM for (c = 0; c < rm->rm_firstdatacol; c++) 2567754SJeff.Bonwick@Sun.COM zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 2577754SJeff.Bonwick@Sun.COM 25810105Sadam.leventhal@sun.com kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 2597754SJeff.Bonwick@Sun.COM } 2607754SJeff.Bonwick@Sun.COM 261789Sahrens static raidz_map_t * 2622082Seschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 2632082Seschrock uint64_t nparity) 264789Sahrens { 265789Sahrens raidz_map_t *rm; 266789Sahrens uint64_t b = zio->io_offset >> unit_shift; 267789Sahrens uint64_t s = zio->io_size >> unit_shift; 268789Sahrens uint64_t f = b % dcols; 269789Sahrens uint64_t o = (b / dcols) << unit_shift; 27010105Sadam.leventhal@sun.com uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 271789Sahrens 2722082Seschrock q = s / (dcols - nparity); 2732082Seschrock r = s - q * (dcols - nparity); 2742082Seschrock bc = (r == 0 ? 0 : r + nparity); 27510105Sadam.leventhal@sun.com tot = s + nparity * (q + (r == 0 ? 0 : 1)); 276789Sahrens 27710105Sadam.leventhal@sun.com if (q == 0) { 27810105Sadam.leventhal@sun.com acols = bc; 27910105Sadam.leventhal@sun.com scols = MIN(dcols, roundup(bc, nparity + 1)); 28010105Sadam.leventhal@sun.com } else { 28110105Sadam.leventhal@sun.com acols = dcols; 28210105Sadam.leventhal@sun.com scols = dcols; 28310105Sadam.leventhal@sun.com } 284789Sahrens 28510105Sadam.leventhal@sun.com ASSERT3U(acols, <=, scols); 28610105Sadam.leventhal@sun.com 28710105Sadam.leventhal@sun.com rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 288789Sahrens 289789Sahrens rm->rm_cols = acols; 29010105Sadam.leventhal@sun.com rm->rm_scols = scols; 291789Sahrens rm->rm_bigcols = bc; 292*10450Sadam.leventhal@sun.com rm->rm_skipstart = bc; 2932082Seschrock rm->rm_missingdata = 0; 2942082Seschrock rm->rm_missingparity = 0; 2952082Seschrock rm->rm_firstdatacol = nparity; 296789Sahrens 29710105Sadam.leventhal@sun.com asize = 0; 29810105Sadam.leventhal@sun.com 29910105Sadam.leventhal@sun.com for (c = 0; c < scols; c++) { 300789Sahrens col = f + c; 301789Sahrens coff = o; 302789Sahrens if (col >= dcols) { 303789Sahrens col -= dcols; 304789Sahrens coff += 1ULL << unit_shift; 305789Sahrens } 3062082Seschrock rm->rm_col[c].rc_devidx = col; 307789Sahrens rm->rm_col[c].rc_offset = coff; 308789Sahrens rm->rm_col[c].rc_data = NULL; 309789Sahrens rm->rm_col[c].rc_error = 0; 310789Sahrens rm->rm_col[c].rc_tried = 0; 311789Sahrens rm->rm_col[c].rc_skipped = 0; 31210105Sadam.leventhal@sun.com 31310105Sadam.leventhal@sun.com if (c >= acols) 31410105Sadam.leventhal@sun.com rm->rm_col[c].rc_size = 0; 31510105Sadam.leventhal@sun.com else if (c < bc) 31610105Sadam.leventhal@sun.com rm->rm_col[c].rc_size = (q + 1) << unit_shift; 31710105Sadam.leventhal@sun.com else 31810105Sadam.leventhal@sun.com rm->rm_col[c].rc_size = q << unit_shift; 31910105Sadam.leventhal@sun.com 32010105Sadam.leventhal@sun.com asize += rm->rm_col[c].rc_size; 321789Sahrens } 322789Sahrens 32310105Sadam.leventhal@sun.com ASSERT3U(asize, ==, tot << unit_shift); 32410105Sadam.leventhal@sun.com rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 325*10450Sadam.leventhal@sun.com rm->rm_nskip = roundup(tot, nparity + 1) - tot; 326*10450Sadam.leventhal@sun.com ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 327*10450Sadam.leventhal@sun.com ASSERT3U(rm->rm_nskip, <=, nparity); 328789Sahrens 329789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++) 330789Sahrens rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 331789Sahrens 332789Sahrens rm->rm_col[c].rc_data = zio->io_data; 333789Sahrens 334789Sahrens for (c = c + 1; c < acols; c++) 335789Sahrens rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 336789Sahrens rm->rm_col[c - 1].rc_size; 337789Sahrens 3381133Seschrock /* 3392082Seschrock * If all data stored spans all columns, there's a danger that parity 3402082Seschrock * will always be on the same device and, since parity isn't read 3412082Seschrock * during normal operation, that that device's I/O bandwidth won't be 3422082Seschrock * used effectively. We therefore switch the parity every 1MB. 3432082Seschrock * 3442082Seschrock * ... at least that was, ostensibly, the theory. As a practical 3452082Seschrock * matter unless we juggle the parity between all devices evenly, we 3462082Seschrock * won't see any benefit. Further, occasional writes that aren't a 3472082Seschrock * multiple of the LCM of the number of children and the minimum 3482082Seschrock * stripe width are sufficient to avoid pessimal behavior. 3492082Seschrock * Unfortunately, this decision created an implicit on-disk format 3503456Sahl * requirement that we need to support for all eternity, but only 3513456Sahl * for single-parity RAID-Z. 352*10450Sadam.leventhal@sun.com * 353*10450Sadam.leventhal@sun.com * If we intend to skip a sector in the zeroth column for padding 354*10450Sadam.leventhal@sun.com * we must make sure to note this swap. We will never intend to 355*10450Sadam.leventhal@sun.com * skip the first column since at least one data and one parity 356*10450Sadam.leventhal@sun.com * column must appear in each row. 3571133Seschrock */ 3581133Seschrock ASSERT(rm->rm_cols >= 2); 3591133Seschrock ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 360789Sahrens 3612082Seschrock if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 3622082Seschrock devidx = rm->rm_col[0].rc_devidx; 3631133Seschrock o = rm->rm_col[0].rc_offset; 3642082Seschrock rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 3651133Seschrock rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 3662082Seschrock rm->rm_col[1].rc_devidx = devidx; 3671133Seschrock rm->rm_col[1].rc_offset = o; 368*10450Sadam.leventhal@sun.com 369*10450Sadam.leventhal@sun.com if (rm->rm_skipstart == 0) 370*10450Sadam.leventhal@sun.com rm->rm_skipstart = 1; 371789Sahrens } 372789Sahrens 373789Sahrens zio->io_vsd = rm; 3747754SJeff.Bonwick@Sun.COM zio->io_vsd_free = vdev_raidz_map_free; 375789Sahrens return (rm); 376789Sahrens } 377789Sahrens 378789Sahrens static void 3792082Seschrock vdev_raidz_generate_parity_p(raidz_map_t *rm) 3802082Seschrock { 3812082Seschrock uint64_t *p, *src, pcount, ccount, i; 3822082Seschrock int c; 3832082Seschrock 3842082Seschrock pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 3852082Seschrock 3862082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 3872082Seschrock src = rm->rm_col[c].rc_data; 3882082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 3892082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 3902082Seschrock 3912082Seschrock if (c == rm->rm_firstdatacol) { 3922082Seschrock ASSERT(ccount == pcount); 39310105Sadam.leventhal@sun.com for (i = 0; i < ccount; i++, src++, p++) { 3942082Seschrock *p = *src; 3952082Seschrock } 3962082Seschrock } else { 3972082Seschrock ASSERT(ccount <= pcount); 39810105Sadam.leventhal@sun.com for (i = 0; i < ccount; i++, src++, p++) { 3992082Seschrock *p ^= *src; 4002082Seschrock } 4012082Seschrock } 4022082Seschrock } 4032082Seschrock } 4042082Seschrock 4052082Seschrock static void 4062082Seschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm) 407789Sahrens { 40810105Sadam.leventhal@sun.com uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 4092082Seschrock int c; 4102082Seschrock 41110105Sadam.leventhal@sun.com pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 4122082Seschrock ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 4132082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 4142082Seschrock 4152082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 4162082Seschrock src = rm->rm_col[c].rc_data; 4172082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 4182082Seschrock q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 41910105Sadam.leventhal@sun.com 42010105Sadam.leventhal@sun.com ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 4212082Seschrock 4222082Seschrock if (c == rm->rm_firstdatacol) { 42310105Sadam.leventhal@sun.com ASSERT(ccnt == pcnt || ccnt == 0); 42410105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++) { 4252082Seschrock *p = *src; 42610105Sadam.leventhal@sun.com *q = *src; 4272082Seschrock } 42810105Sadam.leventhal@sun.com for (; i < pcnt; i++, src++, p++, q++) { 42910105Sadam.leventhal@sun.com *p = 0; 4302082Seschrock *q = 0; 4312082Seschrock } 4322082Seschrock } else { 43310105Sadam.leventhal@sun.com ASSERT(ccnt <= pcnt); 434789Sahrens 4352082Seschrock /* 43610105Sadam.leventhal@sun.com * Apply the algorithm described above by multiplying 43710105Sadam.leventhal@sun.com * the previous result and adding in the new value. 4382082Seschrock */ 43910105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++) { 44010105Sadam.leventhal@sun.com *p ^= *src; 44110105Sadam.leventhal@sun.com 44210105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask); 4432082Seschrock *q ^= *src; 4442082Seschrock } 4452082Seschrock 4462082Seschrock /* 4472082Seschrock * Treat short columns as though they are full of 0s. 44810105Sadam.leventhal@sun.com * Note that there's therefore nothing needed for P. 4492082Seschrock */ 45010105Sadam.leventhal@sun.com for (; i < pcnt; i++, q++) { 45110105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask); 4522082Seschrock } 4532082Seschrock } 4542082Seschrock } 4552082Seschrock } 4562082Seschrock 4572082Seschrock static void 45810105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 45910105Sadam.leventhal@sun.com { 46010105Sadam.leventhal@sun.com uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 46110105Sadam.leventhal@sun.com int c; 46210105Sadam.leventhal@sun.com 46310105Sadam.leventhal@sun.com pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 46410105Sadam.leventhal@sun.com ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 46510105Sadam.leventhal@sun.com rm->rm_col[VDEV_RAIDZ_Q].rc_size); 46610105Sadam.leventhal@sun.com ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 46710105Sadam.leventhal@sun.com rm->rm_col[VDEV_RAIDZ_R].rc_size); 46810105Sadam.leventhal@sun.com 46910105Sadam.leventhal@sun.com for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 47010105Sadam.leventhal@sun.com src = rm->rm_col[c].rc_data; 47110105Sadam.leventhal@sun.com p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 47210105Sadam.leventhal@sun.com q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 47310105Sadam.leventhal@sun.com r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 47410105Sadam.leventhal@sun.com 47510105Sadam.leventhal@sun.com ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 47610105Sadam.leventhal@sun.com 47710105Sadam.leventhal@sun.com if (c == rm->rm_firstdatacol) { 47810105Sadam.leventhal@sun.com ASSERT(ccnt == pcnt || ccnt == 0); 47910105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 48010105Sadam.leventhal@sun.com *p = *src; 48110105Sadam.leventhal@sun.com *q = *src; 48210105Sadam.leventhal@sun.com *r = *src; 48310105Sadam.leventhal@sun.com } 48410105Sadam.leventhal@sun.com for (; i < pcnt; i++, src++, p++, q++, r++) { 48510105Sadam.leventhal@sun.com *p = 0; 48610105Sadam.leventhal@sun.com *q = 0; 48710105Sadam.leventhal@sun.com *r = 0; 48810105Sadam.leventhal@sun.com } 48910105Sadam.leventhal@sun.com } else { 49010105Sadam.leventhal@sun.com ASSERT(ccnt <= pcnt); 49110105Sadam.leventhal@sun.com 49210105Sadam.leventhal@sun.com /* 49310105Sadam.leventhal@sun.com * Apply the algorithm described above by multiplying 49410105Sadam.leventhal@sun.com * the previous result and adding in the new value. 49510105Sadam.leventhal@sun.com */ 49610105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 49710105Sadam.leventhal@sun.com *p ^= *src; 49810105Sadam.leventhal@sun.com 49910105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask); 50010105Sadam.leventhal@sun.com *q ^= *src; 50110105Sadam.leventhal@sun.com 50210105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_4(*r, mask); 50310105Sadam.leventhal@sun.com *r ^= *src; 50410105Sadam.leventhal@sun.com } 50510105Sadam.leventhal@sun.com 50610105Sadam.leventhal@sun.com /* 50710105Sadam.leventhal@sun.com * Treat short columns as though they are full of 0s. 50810105Sadam.leventhal@sun.com * Note that there's therefore nothing needed for P. 50910105Sadam.leventhal@sun.com */ 51010105Sadam.leventhal@sun.com for (; i < pcnt; i++, q++, r++) { 51110105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask); 51210105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_4(*r, mask); 51310105Sadam.leventhal@sun.com } 51410105Sadam.leventhal@sun.com } 51510105Sadam.leventhal@sun.com } 51610105Sadam.leventhal@sun.com } 51710105Sadam.leventhal@sun.com 51810105Sadam.leventhal@sun.com /* 51910105Sadam.leventhal@sun.com * Generate RAID parity in the first virtual columns according to the number of 52010105Sadam.leventhal@sun.com * parity columns available. 52110105Sadam.leventhal@sun.com */ 52210105Sadam.leventhal@sun.com static void 52310105Sadam.leventhal@sun.com vdev_raidz_generate_parity(raidz_map_t *rm) 52410105Sadam.leventhal@sun.com { 52510105Sadam.leventhal@sun.com switch (rm->rm_firstdatacol) { 52610105Sadam.leventhal@sun.com case 1: 52710105Sadam.leventhal@sun.com vdev_raidz_generate_parity_p(rm); 52810105Sadam.leventhal@sun.com break; 52910105Sadam.leventhal@sun.com case 2: 53010105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pq(rm); 53110105Sadam.leventhal@sun.com break; 53210105Sadam.leventhal@sun.com case 3: 53310105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pqr(rm); 53410105Sadam.leventhal@sun.com break; 53510105Sadam.leventhal@sun.com default: 53610105Sadam.leventhal@sun.com cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 53710105Sadam.leventhal@sun.com } 53810105Sadam.leventhal@sun.com } 53910105Sadam.leventhal@sun.com 54010105Sadam.leventhal@sun.com static int 54110105Sadam.leventhal@sun.com vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 5422082Seschrock { 5432082Seschrock uint64_t *dst, *src, xcount, ccount, count, i; 54410105Sadam.leventhal@sun.com int x = tgts[0]; 5452082Seschrock int c; 5462082Seschrock 54710105Sadam.leventhal@sun.com ASSERT(ntgts == 1); 54810105Sadam.leventhal@sun.com ASSERT(x >= rm->rm_firstdatacol); 54910105Sadam.leventhal@sun.com ASSERT(x < rm->rm_cols); 55010105Sadam.leventhal@sun.com 5512082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 5522082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 5532082Seschrock ASSERT(xcount > 0); 5542082Seschrock 5552082Seschrock src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 5562082Seschrock dst = rm->rm_col[x].rc_data; 5572082Seschrock for (i = 0; i < xcount; i++, dst++, src++) { 5582082Seschrock *dst = *src; 5592082Seschrock } 5602082Seschrock 5612082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 562789Sahrens src = rm->rm_col[c].rc_data; 563789Sahrens dst = rm->rm_col[x].rc_data; 5642082Seschrock 5652082Seschrock if (c == x) 5662082Seschrock continue; 5672082Seschrock 5682082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 5692082Seschrock count = MIN(ccount, xcount); 5702082Seschrock 5712082Seschrock for (i = 0; i < count; i++, dst++, src++) { 5722082Seschrock *dst ^= *src; 573789Sahrens } 574789Sahrens } 57510105Sadam.leventhal@sun.com 57610105Sadam.leventhal@sun.com return (1 << VDEV_RAIDZ_P); 577789Sahrens } 578789Sahrens 57910105Sadam.leventhal@sun.com static int 58010105Sadam.leventhal@sun.com vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 5812082Seschrock { 5822082Seschrock uint64_t *dst, *src, xcount, ccount, count, mask, i; 5832082Seschrock uint8_t *b; 58410105Sadam.leventhal@sun.com int x = tgts[0]; 5852082Seschrock int c, j, exp; 5862082Seschrock 58710105Sadam.leventhal@sun.com ASSERT(ntgts == 1); 58810105Sadam.leventhal@sun.com 5892082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 5902082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 5912082Seschrock 5922082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 5932082Seschrock src = rm->rm_col[c].rc_data; 5942082Seschrock dst = rm->rm_col[x].rc_data; 5952082Seschrock 5962082Seschrock if (c == x) 5972082Seschrock ccount = 0; 5982082Seschrock else 5992082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 6002082Seschrock 6012082Seschrock count = MIN(ccount, xcount); 6022082Seschrock 6032082Seschrock if (c == rm->rm_firstdatacol) { 6042082Seschrock for (i = 0; i < count; i++, dst++, src++) { 6052082Seschrock *dst = *src; 6062082Seschrock } 6072082Seschrock for (; i < xcount; i++, dst++) { 6082082Seschrock *dst = 0; 6092082Seschrock } 6102082Seschrock 6112082Seschrock } else { 6122082Seschrock for (i = 0; i < count; i++, dst++, src++) { 61310105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*dst, mask); 6142082Seschrock *dst ^= *src; 6152082Seschrock } 6162082Seschrock 6172082Seschrock for (; i < xcount; i++, dst++) { 61810105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*dst, mask); 6192082Seschrock } 6202082Seschrock } 6212082Seschrock } 6222082Seschrock 6232082Seschrock src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 6242082Seschrock dst = rm->rm_col[x].rc_data; 6252082Seschrock exp = 255 - (rm->rm_cols - 1 - x); 6262082Seschrock 6272082Seschrock for (i = 0; i < xcount; i++, dst++, src++) { 6282082Seschrock *dst ^= *src; 6292082Seschrock for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 6302082Seschrock *b = vdev_raidz_exp2(*b, exp); 6312082Seschrock } 6322082Seschrock } 63310105Sadam.leventhal@sun.com 63410105Sadam.leventhal@sun.com return (1 << VDEV_RAIDZ_Q); 6352082Seschrock } 6362082Seschrock 63710105Sadam.leventhal@sun.com static int 63810105Sadam.leventhal@sun.com vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 6392082Seschrock { 6402082Seschrock uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 6412082Seschrock void *pdata, *qdata; 6422082Seschrock uint64_t xsize, ysize, i; 64310105Sadam.leventhal@sun.com int x = tgts[0]; 64410105Sadam.leventhal@sun.com int y = tgts[1]; 6452082Seschrock 64610105Sadam.leventhal@sun.com ASSERT(ntgts == 2); 6472082Seschrock ASSERT(x < y); 6482082Seschrock ASSERT(x >= rm->rm_firstdatacol); 6492082Seschrock ASSERT(y < rm->rm_cols); 6502082Seschrock 6512082Seschrock ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 6522082Seschrock 6532082Seschrock /* 6542082Seschrock * Move the parity data aside -- we're going to compute parity as 6552082Seschrock * though columns x and y were full of zeros -- Pxy and Qxy. We want to 6562082Seschrock * reuse the parity generation mechanism without trashing the actual 6572082Seschrock * parity so we make those columns appear to be full of zeros by 6582082Seschrock * setting their lengths to zero. 6592082Seschrock */ 6602082Seschrock pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 6612082Seschrock qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 6622082Seschrock xsize = rm->rm_col[x].rc_size; 6632082Seschrock ysize = rm->rm_col[y].rc_size; 6642082Seschrock 6652082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = 6662082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 6672082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = 6682082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 6692082Seschrock rm->rm_col[x].rc_size = 0; 6702082Seschrock rm->rm_col[y].rc_size = 0; 6712082Seschrock 6722082Seschrock vdev_raidz_generate_parity_pq(rm); 6732082Seschrock 6742082Seschrock rm->rm_col[x].rc_size = xsize; 6752082Seschrock rm->rm_col[y].rc_size = ysize; 6762082Seschrock 6772082Seschrock p = pdata; 6782082Seschrock q = qdata; 6792082Seschrock pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 6802082Seschrock qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 6812082Seschrock xd = rm->rm_col[x].rc_data; 6822082Seschrock yd = rm->rm_col[y].rc_data; 6832082Seschrock 6842082Seschrock /* 6852082Seschrock * We now have: 6862082Seschrock * Pxy = P + D_x + D_y 6872082Seschrock * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 6882082Seschrock * 6892082Seschrock * We can then solve for D_x: 6902082Seschrock * D_x = A * (P + Pxy) + B * (Q + Qxy) 6912082Seschrock * where 6922082Seschrock * A = 2^(x - y) * (2^(x - y) + 1)^-1 6932082Seschrock * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 6942082Seschrock * 6952082Seschrock * With D_x in hand, we can easily solve for D_y: 6962082Seschrock * D_y = P + Pxy + D_x 6972082Seschrock */ 6982082Seschrock 6992082Seschrock a = vdev_raidz_pow2[255 + x - y]; 7002082Seschrock b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 7012082Seschrock tmp = 255 - vdev_raidz_log2[a ^ 1]; 7022082Seschrock 7032082Seschrock aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 7042082Seschrock bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 7052082Seschrock 7062082Seschrock for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 7072082Seschrock *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 7082082Seschrock vdev_raidz_exp2(*q ^ *qxy, bexp); 7092082Seschrock 7102082Seschrock if (i < ysize) 7112082Seschrock *yd = *p ^ *pxy ^ *xd; 7122082Seschrock } 7132082Seschrock 7142082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 7152082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_size); 7162082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 7172082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 7182082Seschrock 7192082Seschrock /* 7202082Seschrock * Restore the saved parity data. 7212082Seschrock */ 7222082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 7232082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 72410105Sadam.leventhal@sun.com 72510105Sadam.leventhal@sun.com return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 72610105Sadam.leventhal@sun.com } 72710105Sadam.leventhal@sun.com 72810105Sadam.leventhal@sun.com /* BEGIN CSTYLED */ 72910105Sadam.leventhal@sun.com /* 73010105Sadam.leventhal@sun.com * In the general case of reconstruction, we must solve the system of linear 73110105Sadam.leventhal@sun.com * equations defined by the coeffecients used to generate parity as well as 73210105Sadam.leventhal@sun.com * the contents of the data and parity disks. This can be expressed with 73310105Sadam.leventhal@sun.com * vectors for the original data (D) and the actual data (d) and parity (p) 73410105Sadam.leventhal@sun.com * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 73510105Sadam.leventhal@sun.com * 73610105Sadam.leventhal@sun.com * __ __ __ __ 73710105Sadam.leventhal@sun.com * | | __ __ | p_0 | 73810105Sadam.leventhal@sun.com * | V | | D_0 | | p_m-1 | 73910105Sadam.leventhal@sun.com * | | x | : | = | d_0 | 74010105Sadam.leventhal@sun.com * | I | | D_n-1 | | : | 74110105Sadam.leventhal@sun.com * | | ~~ ~~ | d_n-1 | 74210105Sadam.leventhal@sun.com * ~~ ~~ ~~ ~~ 74310105Sadam.leventhal@sun.com * 74410105Sadam.leventhal@sun.com * I is simply a square identity matrix of size n, and V is a vandermonde 74510105Sadam.leventhal@sun.com * matrix defined by the coeffecients we chose for the various parity columns 74610105Sadam.leventhal@sun.com * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 74710105Sadam.leventhal@sun.com * computation as well as linear separability. 74810105Sadam.leventhal@sun.com * 74910105Sadam.leventhal@sun.com * __ __ __ __ 75010105Sadam.leventhal@sun.com * | 1 .. 1 1 1 | | p_0 | 75110105Sadam.leventhal@sun.com * | 2^n-1 .. 4 2 1 | __ __ | : | 75210105Sadam.leventhal@sun.com * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 75310105Sadam.leventhal@sun.com * | 1 .. 0 0 0 | | D_1 | | d_0 | 75410105Sadam.leventhal@sun.com * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 75510105Sadam.leventhal@sun.com * | : : : : | | : | | d_2 | 75610105Sadam.leventhal@sun.com * | 0 .. 1 0 0 | | D_n-1 | | : | 75710105Sadam.leventhal@sun.com * | 0 .. 0 1 0 | ~~ ~~ | : | 75810105Sadam.leventhal@sun.com * | 0 .. 0 0 1 | | d_n-1 | 75910105Sadam.leventhal@sun.com * ~~ ~~ ~~ ~~ 76010105Sadam.leventhal@sun.com * 76110105Sadam.leventhal@sun.com * Note that I, V, d, and p are known. To compute D, we must invert the 76210105Sadam.leventhal@sun.com * matrix and use the known data and parity values to reconstruct the unknown 76310105Sadam.leventhal@sun.com * data values. We begin by removing the rows in V|I and d|p that correspond 76410105Sadam.leventhal@sun.com * to failed or missing columns; we then make V|I square (n x n) and d|p 76510105Sadam.leventhal@sun.com * sized n by removing rows corresponding to unused parity from the bottom up 76610105Sadam.leventhal@sun.com * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 76710105Sadam.leventhal@sun.com * using Gauss-Jordan elimination. In the example below we use m=3 parity 76810105Sadam.leventhal@sun.com * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 76910105Sadam.leventhal@sun.com * __ __ 77010105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 | 77110105Sadam.leventhal@sun.com * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 77210105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 | / / 77310105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 | / / 77410105Sadam.leventhal@sun.com * | 0 1 0 0 0 0 0 0 | <--' / 77510105Sadam.leventhal@sun.com * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 77610105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 | 77710105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 | 77810105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 | 77910105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 | 78010105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 | 78110105Sadam.leventhal@sun.com * ~~ ~~ 78210105Sadam.leventhal@sun.com * __ __ 78310105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 | 78410105Sadam.leventhal@sun.com * | 128 64 32 16 8 4 2 1 | 78510105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 | 78610105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 | 78710105Sadam.leventhal@sun.com * | 0 1 0 0 0 0 0 0 | 78810105Sadam.leventhal@sun.com * (V|I)' = | 0 0 1 0 0 0 0 0 | 78910105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 | 79010105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 | 79110105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 | 79210105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 | 79310105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 | 79410105Sadam.leventhal@sun.com * ~~ ~~ 79510105Sadam.leventhal@sun.com * 79610105Sadam.leventhal@sun.com * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 79710105Sadam.leventhal@sun.com * have carefully chosen the seed values 1, 2, and 4 to ensure that this 79810105Sadam.leventhal@sun.com * matrix is not singular. 79910105Sadam.leventhal@sun.com * __ __ 80010105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 80110105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 80210105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 80310105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 80410105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 80510105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 80610105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 80710105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 80810105Sadam.leventhal@sun.com * ~~ ~~ 80910105Sadam.leventhal@sun.com * __ __ 81010105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 81110105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 81210105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 81310105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 81410105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 81510105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 81610105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 81710105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 81810105Sadam.leventhal@sun.com * ~~ ~~ 81910105Sadam.leventhal@sun.com * __ __ 82010105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 82110105Sadam.leventhal@sun.com * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 82210105Sadam.leventhal@sun.com * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 82310105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 82410105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 82510105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 82610105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 82710105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 82810105Sadam.leventhal@sun.com * ~~ ~~ 82910105Sadam.leventhal@sun.com * __ __ 83010105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 83110105Sadam.leventhal@sun.com * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 83210105Sadam.leventhal@sun.com * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 83310105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 83410105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 83510105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 83610105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 83710105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 83810105Sadam.leventhal@sun.com * ~~ ~~ 83910105Sadam.leventhal@sun.com * __ __ 84010105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 84110105Sadam.leventhal@sun.com * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 84210105Sadam.leventhal@sun.com * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 84310105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 84410105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 84510105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 84610105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 84710105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 84810105Sadam.leventhal@sun.com * ~~ ~~ 84910105Sadam.leventhal@sun.com * __ __ 85010105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 85110105Sadam.leventhal@sun.com * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 85210105Sadam.leventhal@sun.com * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 85310105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 85410105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 85510105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 85610105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 85710105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 85810105Sadam.leventhal@sun.com * ~~ ~~ 85910105Sadam.leventhal@sun.com * __ __ 86010105Sadam.leventhal@sun.com * | 0 0 1 0 0 0 0 0 | 86110105Sadam.leventhal@sun.com * | 167 100 5 41 159 169 217 208 | 86210105Sadam.leventhal@sun.com * | 166 100 4 40 158 168 216 209 | 86310105Sadam.leventhal@sun.com * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 86410105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 | 86510105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 | 86610105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 | 86710105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 | 86810105Sadam.leventhal@sun.com * ~~ ~~ 86910105Sadam.leventhal@sun.com * 87010105Sadam.leventhal@sun.com * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 87110105Sadam.leventhal@sun.com * of the missing data. 87210105Sadam.leventhal@sun.com * 87310105Sadam.leventhal@sun.com * As is apparent from the example above, the only non-trivial rows in the 87410105Sadam.leventhal@sun.com * inverse matrix correspond to the data disks that we're trying to 87510105Sadam.leventhal@sun.com * reconstruct. Indeed, those are the only rows we need as the others would 87610105Sadam.leventhal@sun.com * only be useful for reconstructing data known or assumed to be valid. For 87710105Sadam.leventhal@sun.com * that reason, we only build the coefficients in the rows that correspond to 87810105Sadam.leventhal@sun.com * targeted columns. 87910105Sadam.leventhal@sun.com */ 88010105Sadam.leventhal@sun.com /* END CSTYLED */ 88110105Sadam.leventhal@sun.com 88210105Sadam.leventhal@sun.com static void 88310105Sadam.leventhal@sun.com vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 88410105Sadam.leventhal@sun.com uint8_t **rows) 88510105Sadam.leventhal@sun.com { 88610105Sadam.leventhal@sun.com int i, j; 88710105Sadam.leventhal@sun.com int pow; 88810105Sadam.leventhal@sun.com 88910105Sadam.leventhal@sun.com ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 89010105Sadam.leventhal@sun.com 89110105Sadam.leventhal@sun.com /* 89210105Sadam.leventhal@sun.com * Fill in the missing rows of interest. 89310105Sadam.leventhal@sun.com */ 89410105Sadam.leventhal@sun.com for (i = 0; i < nmap; i++) { 89510105Sadam.leventhal@sun.com ASSERT3S(0, <=, map[i]); 89610105Sadam.leventhal@sun.com ASSERT3S(map[i], <=, 2); 89710105Sadam.leventhal@sun.com 89810105Sadam.leventhal@sun.com pow = map[i] * n; 89910105Sadam.leventhal@sun.com if (pow > 255) 90010105Sadam.leventhal@sun.com pow -= 255; 90110105Sadam.leventhal@sun.com ASSERT(pow <= 255); 90210105Sadam.leventhal@sun.com 90310105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 90410105Sadam.leventhal@sun.com pow -= map[i]; 90510105Sadam.leventhal@sun.com if (pow < 0) 90610105Sadam.leventhal@sun.com pow += 255; 90710105Sadam.leventhal@sun.com rows[i][j] = vdev_raidz_pow2[pow]; 90810105Sadam.leventhal@sun.com } 90910105Sadam.leventhal@sun.com } 9102082Seschrock } 9112082Seschrock 91210105Sadam.leventhal@sun.com static void 91310105Sadam.leventhal@sun.com vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 91410105Sadam.leventhal@sun.com uint8_t **rows, uint8_t **invrows, const uint8_t *used) 91510105Sadam.leventhal@sun.com { 91610105Sadam.leventhal@sun.com int i, j, ii, jj; 91710105Sadam.leventhal@sun.com uint8_t log; 91810105Sadam.leventhal@sun.com 91910105Sadam.leventhal@sun.com /* 92010105Sadam.leventhal@sun.com * Assert that the first nmissing entries from the array of used 92110105Sadam.leventhal@sun.com * columns correspond to parity columns and that subsequent entries 92210105Sadam.leventhal@sun.com * correspond to data columns. 92310105Sadam.leventhal@sun.com */ 92410105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 92510105Sadam.leventhal@sun.com ASSERT3S(used[i], <, rm->rm_firstdatacol); 92610105Sadam.leventhal@sun.com } 92710105Sadam.leventhal@sun.com for (; i < n; i++) { 92810105Sadam.leventhal@sun.com ASSERT3S(used[i], >=, rm->rm_firstdatacol); 92910105Sadam.leventhal@sun.com } 93010105Sadam.leventhal@sun.com 93110105Sadam.leventhal@sun.com /* 93210105Sadam.leventhal@sun.com * First initialize the storage where we'll compute the inverse rows. 93310105Sadam.leventhal@sun.com */ 93410105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 93510105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 93610105Sadam.leventhal@sun.com invrows[i][j] = (i == j) ? 1 : 0; 93710105Sadam.leventhal@sun.com } 93810105Sadam.leventhal@sun.com } 93910105Sadam.leventhal@sun.com 94010105Sadam.leventhal@sun.com /* 94110105Sadam.leventhal@sun.com * Subtract all trivial rows from the rows of consequence. 94210105Sadam.leventhal@sun.com */ 94310105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 94410105Sadam.leventhal@sun.com for (j = nmissing; j < n; j++) { 94510105Sadam.leventhal@sun.com ASSERT3U(used[j], >=, rm->rm_firstdatacol); 94610105Sadam.leventhal@sun.com jj = used[j] - rm->rm_firstdatacol; 94710105Sadam.leventhal@sun.com ASSERT3S(jj, <, n); 94810105Sadam.leventhal@sun.com invrows[i][j] = rows[i][jj]; 94910105Sadam.leventhal@sun.com rows[i][jj] = 0; 95010105Sadam.leventhal@sun.com } 95110105Sadam.leventhal@sun.com } 95210105Sadam.leventhal@sun.com 95310105Sadam.leventhal@sun.com /* 95410105Sadam.leventhal@sun.com * For each of the rows of interest, we must normalize it and subtract 95510105Sadam.leventhal@sun.com * a multiple of it from the other rows. 95610105Sadam.leventhal@sun.com */ 95710105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 95810105Sadam.leventhal@sun.com for (j = 0; j < missing[i]; j++) { 95910105Sadam.leventhal@sun.com ASSERT3U(rows[i][j], ==, 0); 96010105Sadam.leventhal@sun.com } 96110105Sadam.leventhal@sun.com ASSERT3U(rows[i][missing[i]], !=, 0); 96210105Sadam.leventhal@sun.com 96310105Sadam.leventhal@sun.com /* 96410105Sadam.leventhal@sun.com * Compute the inverse of the first element and multiply each 96510105Sadam.leventhal@sun.com * element in the row by that value. 96610105Sadam.leventhal@sun.com */ 96710105Sadam.leventhal@sun.com log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 96810105Sadam.leventhal@sun.com 96910105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 97010105Sadam.leventhal@sun.com rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 97110105Sadam.leventhal@sun.com invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 97210105Sadam.leventhal@sun.com } 97310105Sadam.leventhal@sun.com 97410105Sadam.leventhal@sun.com for (ii = 0; ii < nmissing; ii++) { 97510105Sadam.leventhal@sun.com if (i == ii) 97610105Sadam.leventhal@sun.com continue; 97710105Sadam.leventhal@sun.com 97810105Sadam.leventhal@sun.com ASSERT3U(rows[ii][missing[i]], !=, 0); 97910105Sadam.leventhal@sun.com 98010105Sadam.leventhal@sun.com log = vdev_raidz_log2[rows[ii][missing[i]]]; 98110105Sadam.leventhal@sun.com 98210105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 98310105Sadam.leventhal@sun.com rows[ii][j] ^= 98410105Sadam.leventhal@sun.com vdev_raidz_exp2(rows[i][j], log); 98510105Sadam.leventhal@sun.com invrows[ii][j] ^= 98610105Sadam.leventhal@sun.com vdev_raidz_exp2(invrows[i][j], log); 98710105Sadam.leventhal@sun.com } 98810105Sadam.leventhal@sun.com } 98910105Sadam.leventhal@sun.com } 99010105Sadam.leventhal@sun.com 99110105Sadam.leventhal@sun.com /* 99210105Sadam.leventhal@sun.com * Verify that the data that is left in the rows are properly part of 99310105Sadam.leventhal@sun.com * an identity matrix. 99410105Sadam.leventhal@sun.com */ 99510105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 99610105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 99710105Sadam.leventhal@sun.com if (j == missing[i]) { 99810105Sadam.leventhal@sun.com ASSERT3U(rows[i][j], ==, 1); 99910105Sadam.leventhal@sun.com } else { 100010105Sadam.leventhal@sun.com ASSERT3U(rows[i][j], ==, 0); 100110105Sadam.leventhal@sun.com } 100210105Sadam.leventhal@sun.com } 100310105Sadam.leventhal@sun.com } 100410105Sadam.leventhal@sun.com } 100510105Sadam.leventhal@sun.com 100610105Sadam.leventhal@sun.com static void 100710105Sadam.leventhal@sun.com vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 100810105Sadam.leventhal@sun.com int *missing, uint8_t **invrows, const uint8_t *used) 100910105Sadam.leventhal@sun.com { 101010105Sadam.leventhal@sun.com int i, j, x, cc, c; 101110105Sadam.leventhal@sun.com uint8_t *src; 101210105Sadam.leventhal@sun.com uint64_t ccount; 101310105Sadam.leventhal@sun.com uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 101410105Sadam.leventhal@sun.com uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 101510105Sadam.leventhal@sun.com uint8_t log, val; 101610105Sadam.leventhal@sun.com int ll; 101710105Sadam.leventhal@sun.com uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 101810105Sadam.leventhal@sun.com uint8_t *p, *pp; 101910105Sadam.leventhal@sun.com size_t psize; 102010105Sadam.leventhal@sun.com 102110105Sadam.leventhal@sun.com psize = sizeof (invlog[0][0]) * n * nmissing; 102210105Sadam.leventhal@sun.com p = kmem_alloc(psize, KM_SLEEP); 102310105Sadam.leventhal@sun.com 102410105Sadam.leventhal@sun.com for (pp = p, i = 0; i < nmissing; i++) { 102510105Sadam.leventhal@sun.com invlog[i] = pp; 102610105Sadam.leventhal@sun.com pp += n; 102710105Sadam.leventhal@sun.com } 102810105Sadam.leventhal@sun.com 102910105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 103010105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 103110105Sadam.leventhal@sun.com ASSERT3U(invrows[i][j], !=, 0); 103210105Sadam.leventhal@sun.com invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 103310105Sadam.leventhal@sun.com } 103410105Sadam.leventhal@sun.com } 103510105Sadam.leventhal@sun.com 103610105Sadam.leventhal@sun.com for (i = 0; i < n; i++) { 103710105Sadam.leventhal@sun.com c = used[i]; 103810105Sadam.leventhal@sun.com ASSERT3U(c, <, rm->rm_cols); 103910105Sadam.leventhal@sun.com 104010105Sadam.leventhal@sun.com src = rm->rm_col[c].rc_data; 104110105Sadam.leventhal@sun.com ccount = rm->rm_col[c].rc_size; 104210105Sadam.leventhal@sun.com for (j = 0; j < nmissing; j++) { 104310105Sadam.leventhal@sun.com cc = missing[j] + rm->rm_firstdatacol; 104410105Sadam.leventhal@sun.com ASSERT3U(cc, >=, rm->rm_firstdatacol); 104510105Sadam.leventhal@sun.com ASSERT3U(cc, <, rm->rm_cols); 104610105Sadam.leventhal@sun.com ASSERT3U(cc, !=, c); 104710105Sadam.leventhal@sun.com 104810105Sadam.leventhal@sun.com dst[j] = rm->rm_col[cc].rc_data; 104910105Sadam.leventhal@sun.com dcount[j] = rm->rm_col[cc].rc_size; 105010105Sadam.leventhal@sun.com } 105110105Sadam.leventhal@sun.com 105210105Sadam.leventhal@sun.com ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 105310105Sadam.leventhal@sun.com 105410105Sadam.leventhal@sun.com for (x = 0; x < ccount; x++, src++) { 105510105Sadam.leventhal@sun.com if (*src != 0) 105610105Sadam.leventhal@sun.com log = vdev_raidz_log2[*src]; 105710105Sadam.leventhal@sun.com 105810105Sadam.leventhal@sun.com for (cc = 0; cc < nmissing; cc++) { 105910105Sadam.leventhal@sun.com if (x >= dcount[cc]) 106010105Sadam.leventhal@sun.com continue; 106110105Sadam.leventhal@sun.com 106210105Sadam.leventhal@sun.com if (*src == 0) { 106310105Sadam.leventhal@sun.com val = 0; 106410105Sadam.leventhal@sun.com } else { 106510105Sadam.leventhal@sun.com if ((ll = log + invlog[cc][i]) >= 255) 106610105Sadam.leventhal@sun.com ll -= 255; 106710105Sadam.leventhal@sun.com val = vdev_raidz_pow2[ll]; 106810105Sadam.leventhal@sun.com } 106910105Sadam.leventhal@sun.com 107010105Sadam.leventhal@sun.com if (i == 0) 107110105Sadam.leventhal@sun.com dst[cc][x] = val; 107210105Sadam.leventhal@sun.com else 107310105Sadam.leventhal@sun.com dst[cc][x] ^= val; 107410105Sadam.leventhal@sun.com } 107510105Sadam.leventhal@sun.com } 107610105Sadam.leventhal@sun.com } 107710105Sadam.leventhal@sun.com 107810105Sadam.leventhal@sun.com kmem_free(p, psize); 107910105Sadam.leventhal@sun.com } 108010105Sadam.leventhal@sun.com 108110105Sadam.leventhal@sun.com static int 108210105Sadam.leventhal@sun.com vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 108310105Sadam.leventhal@sun.com { 108410105Sadam.leventhal@sun.com int n, i, c, t, tt; 108510105Sadam.leventhal@sun.com int nmissing_rows; 108610105Sadam.leventhal@sun.com int missing_rows[VDEV_RAIDZ_MAXPARITY]; 108710105Sadam.leventhal@sun.com int parity_map[VDEV_RAIDZ_MAXPARITY]; 108810105Sadam.leventhal@sun.com 108910105Sadam.leventhal@sun.com uint8_t *p, *pp; 109010105Sadam.leventhal@sun.com size_t psize; 109110105Sadam.leventhal@sun.com 109210105Sadam.leventhal@sun.com uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 109310105Sadam.leventhal@sun.com uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 109410105Sadam.leventhal@sun.com uint8_t *used; 109510105Sadam.leventhal@sun.com 109610105Sadam.leventhal@sun.com int code = 0; 109710105Sadam.leventhal@sun.com 109810105Sadam.leventhal@sun.com 109910105Sadam.leventhal@sun.com n = rm->rm_cols - rm->rm_firstdatacol; 110010105Sadam.leventhal@sun.com 110110105Sadam.leventhal@sun.com /* 110210105Sadam.leventhal@sun.com * Figure out which data columns are missing. 110310105Sadam.leventhal@sun.com */ 110410105Sadam.leventhal@sun.com nmissing_rows = 0; 110510105Sadam.leventhal@sun.com for (t = 0; t < ntgts; t++) { 110610105Sadam.leventhal@sun.com if (tgts[t] >= rm->rm_firstdatacol) { 110710105Sadam.leventhal@sun.com missing_rows[nmissing_rows++] = 110810105Sadam.leventhal@sun.com tgts[t] - rm->rm_firstdatacol; 110910105Sadam.leventhal@sun.com } 111010105Sadam.leventhal@sun.com } 111110105Sadam.leventhal@sun.com 111210105Sadam.leventhal@sun.com /* 111310105Sadam.leventhal@sun.com * Figure out which parity columns to use to help generate the missing 111410105Sadam.leventhal@sun.com * data columns. 111510105Sadam.leventhal@sun.com */ 111610105Sadam.leventhal@sun.com for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 111710105Sadam.leventhal@sun.com ASSERT(tt < ntgts); 111810105Sadam.leventhal@sun.com ASSERT(c < rm->rm_firstdatacol); 111910105Sadam.leventhal@sun.com 112010105Sadam.leventhal@sun.com /* 112110105Sadam.leventhal@sun.com * Skip any targeted parity columns. 112210105Sadam.leventhal@sun.com */ 112310105Sadam.leventhal@sun.com if (c == tgts[tt]) { 112410105Sadam.leventhal@sun.com tt++; 112510105Sadam.leventhal@sun.com continue; 112610105Sadam.leventhal@sun.com } 112710105Sadam.leventhal@sun.com 112810105Sadam.leventhal@sun.com code |= 1 << c; 112910105Sadam.leventhal@sun.com 113010105Sadam.leventhal@sun.com parity_map[i] = c; 113110105Sadam.leventhal@sun.com i++; 113210105Sadam.leventhal@sun.com } 113310105Sadam.leventhal@sun.com 113410105Sadam.leventhal@sun.com ASSERT(code != 0); 113510105Sadam.leventhal@sun.com ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 113610105Sadam.leventhal@sun.com 113710105Sadam.leventhal@sun.com psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 113810105Sadam.leventhal@sun.com nmissing_rows * n + sizeof (used[0]) * n; 113910105Sadam.leventhal@sun.com p = kmem_alloc(psize, KM_SLEEP); 114010105Sadam.leventhal@sun.com 114110105Sadam.leventhal@sun.com for (pp = p, i = 0; i < nmissing_rows; i++) { 114210105Sadam.leventhal@sun.com rows[i] = pp; 114310105Sadam.leventhal@sun.com pp += n; 114410105Sadam.leventhal@sun.com invrows[i] = pp; 114510105Sadam.leventhal@sun.com pp += n; 114610105Sadam.leventhal@sun.com } 114710105Sadam.leventhal@sun.com used = pp; 114810105Sadam.leventhal@sun.com 114910105Sadam.leventhal@sun.com for (i = 0; i < nmissing_rows; i++) { 115010105Sadam.leventhal@sun.com used[i] = parity_map[i]; 115110105Sadam.leventhal@sun.com } 115210105Sadam.leventhal@sun.com 115310105Sadam.leventhal@sun.com for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 115410105Sadam.leventhal@sun.com if (tt < nmissing_rows && 115510105Sadam.leventhal@sun.com c == missing_rows[tt] + rm->rm_firstdatacol) { 115610105Sadam.leventhal@sun.com tt++; 115710105Sadam.leventhal@sun.com continue; 115810105Sadam.leventhal@sun.com } 115910105Sadam.leventhal@sun.com 116010105Sadam.leventhal@sun.com ASSERT3S(i, <, n); 116110105Sadam.leventhal@sun.com used[i] = c; 116210105Sadam.leventhal@sun.com i++; 116310105Sadam.leventhal@sun.com } 116410105Sadam.leventhal@sun.com 116510105Sadam.leventhal@sun.com /* 116610105Sadam.leventhal@sun.com * Initialize the interesting rows of the matrix. 116710105Sadam.leventhal@sun.com */ 116810105Sadam.leventhal@sun.com vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 116910105Sadam.leventhal@sun.com 117010105Sadam.leventhal@sun.com /* 117110105Sadam.leventhal@sun.com * Invert the matrix. 117210105Sadam.leventhal@sun.com */ 117310105Sadam.leventhal@sun.com vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 117410105Sadam.leventhal@sun.com invrows, used); 117510105Sadam.leventhal@sun.com 117610105Sadam.leventhal@sun.com /* 117710105Sadam.leventhal@sun.com * Reconstruct the missing data using the generated matrix. 117810105Sadam.leventhal@sun.com */ 117910105Sadam.leventhal@sun.com vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 118010105Sadam.leventhal@sun.com invrows, used); 118110105Sadam.leventhal@sun.com 118210105Sadam.leventhal@sun.com kmem_free(p, psize); 118310105Sadam.leventhal@sun.com 118410105Sadam.leventhal@sun.com return (code); 118510105Sadam.leventhal@sun.com } 118610105Sadam.leventhal@sun.com 118710105Sadam.leventhal@sun.com static int 118810105Sadam.leventhal@sun.com vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 118910105Sadam.leventhal@sun.com { 119010105Sadam.leventhal@sun.com int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 119110105Sadam.leventhal@sun.com int ntgts; 119210105Sadam.leventhal@sun.com int i, c; 119310105Sadam.leventhal@sun.com int code; 119410105Sadam.leventhal@sun.com int nbadparity, nbaddata; 119510105Sadam.leventhal@sun.com int parity_valid[VDEV_RAIDZ_MAXPARITY]; 119610105Sadam.leventhal@sun.com 119710105Sadam.leventhal@sun.com /* 119810105Sadam.leventhal@sun.com * The tgts list must already be sorted. 119910105Sadam.leventhal@sun.com */ 120010105Sadam.leventhal@sun.com for (i = 1; i < nt; i++) { 120110105Sadam.leventhal@sun.com ASSERT(t[i] > t[i - 1]); 120210105Sadam.leventhal@sun.com } 120310105Sadam.leventhal@sun.com 120410105Sadam.leventhal@sun.com nbadparity = rm->rm_firstdatacol; 120510105Sadam.leventhal@sun.com nbaddata = rm->rm_cols - nbadparity; 120610105Sadam.leventhal@sun.com ntgts = 0; 120710105Sadam.leventhal@sun.com for (i = 0, c = 0; c < rm->rm_cols; c++) { 120810105Sadam.leventhal@sun.com if (c < rm->rm_firstdatacol) 120910105Sadam.leventhal@sun.com parity_valid[c] = B_FALSE; 121010105Sadam.leventhal@sun.com 121110105Sadam.leventhal@sun.com if (i < nt && c == t[i]) { 121210105Sadam.leventhal@sun.com tgts[ntgts++] = c; 121310105Sadam.leventhal@sun.com i++; 121410105Sadam.leventhal@sun.com } else if (rm->rm_col[c].rc_error != 0) { 121510105Sadam.leventhal@sun.com tgts[ntgts++] = c; 121610105Sadam.leventhal@sun.com } else if (c >= rm->rm_firstdatacol) { 121710105Sadam.leventhal@sun.com nbaddata--; 121810105Sadam.leventhal@sun.com } else { 121910105Sadam.leventhal@sun.com parity_valid[c] = B_TRUE; 122010105Sadam.leventhal@sun.com nbadparity--; 122110105Sadam.leventhal@sun.com } 122210105Sadam.leventhal@sun.com } 122310105Sadam.leventhal@sun.com 122410105Sadam.leventhal@sun.com ASSERT(ntgts >= nt); 122510105Sadam.leventhal@sun.com ASSERT(nbaddata >= 0); 122610105Sadam.leventhal@sun.com ASSERT(nbaddata + nbadparity == ntgts); 122710105Sadam.leventhal@sun.com 122810105Sadam.leventhal@sun.com dt = &tgts[nbadparity]; 122910105Sadam.leventhal@sun.com 123010105Sadam.leventhal@sun.com /* 123110105Sadam.leventhal@sun.com * See if we can use any of our optimized reconstruction routines. 123210105Sadam.leventhal@sun.com */ 123310105Sadam.leventhal@sun.com if (!vdev_raidz_default_to_general) { 123410105Sadam.leventhal@sun.com switch (nbaddata) { 123510105Sadam.leventhal@sun.com case 1: 123610105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_P]) 123710105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_p(rm, dt, 1)); 123810105Sadam.leventhal@sun.com 123910105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 1); 124010105Sadam.leventhal@sun.com 124110105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_Q]) 124210105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_q(rm, dt, 1)); 124310105Sadam.leventhal@sun.com 124410105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 2); 124510105Sadam.leventhal@sun.com break; 124610105Sadam.leventhal@sun.com 124710105Sadam.leventhal@sun.com case 2: 124810105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 1); 124910105Sadam.leventhal@sun.com 125010105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_P] && 125110105Sadam.leventhal@sun.com parity_valid[VDEV_RAIDZ_Q]) 125210105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 125310105Sadam.leventhal@sun.com 125410105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 2); 125510105Sadam.leventhal@sun.com 125610105Sadam.leventhal@sun.com break; 125710105Sadam.leventhal@sun.com } 125810105Sadam.leventhal@sun.com } 125910105Sadam.leventhal@sun.com 126010105Sadam.leventhal@sun.com code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 126110105Sadam.leventhal@sun.com ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 126210105Sadam.leventhal@sun.com ASSERT(code > 0); 126310105Sadam.leventhal@sun.com return (code); 126410105Sadam.leventhal@sun.com } 12652082Seschrock 1266789Sahrens static int 1267789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 1268789Sahrens { 126910105Sadam.leventhal@sun.com vdev_t *cvd; 12702082Seschrock uint64_t nparity = vd->vdev_nparity; 127110105Sadam.leventhal@sun.com int c; 1272789Sahrens int lasterror = 0; 1273789Sahrens int numerrors = 0; 1274789Sahrens 12752082Seschrock ASSERT(nparity > 0); 12762082Seschrock 12772082Seschrock if (nparity > VDEV_RAIDZ_MAXPARITY || 12782082Seschrock vd->vdev_children < nparity + 1) { 1279789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1280789Sahrens return (EINVAL); 1281789Sahrens } 1282789Sahrens 12839846SEric.Taylor@Sun.COM vdev_open_children(vd); 1284789Sahrens 128510105Sadam.leventhal@sun.com for (c = 0; c < vd->vdev_children; c++) { 128610105Sadam.leventhal@sun.com cvd = vd->vdev_child[c]; 12879846SEric.Taylor@Sun.COM 128810105Sadam.leventhal@sun.com if (cvd->vdev_open_error != 0) { 12899846SEric.Taylor@Sun.COM lasterror = cvd->vdev_open_error; 1290789Sahrens numerrors++; 1291789Sahrens continue; 1292789Sahrens } 1293789Sahrens 1294789Sahrens *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 12951732Sbonwick *ashift = MAX(*ashift, cvd->vdev_ashift); 1296789Sahrens } 1297789Sahrens 1298789Sahrens *asize *= vd->vdev_children; 1299789Sahrens 13002082Seschrock if (numerrors > nparity) { 1301789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1302789Sahrens return (lasterror); 1303789Sahrens } 1304789Sahrens 1305789Sahrens return (0); 1306789Sahrens } 1307789Sahrens 1308789Sahrens static void 1309789Sahrens vdev_raidz_close(vdev_t *vd) 1310789Sahrens { 131110105Sadam.leventhal@sun.com int c; 131210105Sadam.leventhal@sun.com 131310105Sadam.leventhal@sun.com for (c = 0; c < vd->vdev_children; c++) 1314789Sahrens vdev_close(vd->vdev_child[c]); 1315789Sahrens } 1316789Sahrens 1317789Sahrens static uint64_t 1318789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1319789Sahrens { 1320789Sahrens uint64_t asize; 13211732Sbonwick uint64_t ashift = vd->vdev_top->vdev_ashift; 1322789Sahrens uint64_t cols = vd->vdev_children; 13232082Seschrock uint64_t nparity = vd->vdev_nparity; 1324789Sahrens 13251732Sbonwick asize = ((psize - 1) >> ashift) + 1; 13262082Seschrock asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 13272082Seschrock asize = roundup(asize, nparity + 1) << ashift; 1328789Sahrens 1329789Sahrens return (asize); 1330789Sahrens } 1331789Sahrens 1332789Sahrens static void 1333789Sahrens vdev_raidz_child_done(zio_t *zio) 1334789Sahrens { 1335789Sahrens raidz_col_t *rc = zio->io_private; 1336789Sahrens 1337789Sahrens rc->rc_error = zio->io_error; 1338789Sahrens rc->rc_tried = 1; 1339789Sahrens rc->rc_skipped = 0; 1340789Sahrens } 1341789Sahrens 13425530Sbonwick static int 1343789Sahrens vdev_raidz_io_start(zio_t *zio) 1344789Sahrens { 1345789Sahrens vdev_t *vd = zio->io_vd; 13461732Sbonwick vdev_t *tvd = vd->vdev_top; 1347789Sahrens vdev_t *cvd; 1348789Sahrens blkptr_t *bp = zio->io_bp; 1349789Sahrens raidz_map_t *rm; 1350789Sahrens raidz_col_t *rc; 135110105Sadam.leventhal@sun.com int c, i; 1352789Sahrens 13532082Seschrock rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 13542082Seschrock vd->vdev_nparity); 1355789Sahrens 13561775Sbillm ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1357789Sahrens 1358789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 135910105Sadam.leventhal@sun.com vdev_raidz_generate_parity(rm); 1360789Sahrens 1361789Sahrens for (c = 0; c < rm->rm_cols; c++) { 1362789Sahrens rc = &rm->rm_col[c]; 13632082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 1364789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1365789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 13667754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0, 1367789Sahrens vdev_raidz_child_done, rc)); 1368789Sahrens } 13695530Sbonwick 137010105Sadam.leventhal@sun.com /* 137110105Sadam.leventhal@sun.com * Generate optional I/Os for any skipped sectors to improve 137210105Sadam.leventhal@sun.com * aggregation contiguity. 137310105Sadam.leventhal@sun.com */ 1374*10450Sadam.leventhal@sun.com for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 137510105Sadam.leventhal@sun.com ASSERT(c <= rm->rm_scols); 137610105Sadam.leventhal@sun.com if (c == rm->rm_scols) 137710105Sadam.leventhal@sun.com c = 0; 137810105Sadam.leventhal@sun.com rc = &rm->rm_col[c]; 137910105Sadam.leventhal@sun.com cvd = vd->vdev_child[rc->rc_devidx]; 138010105Sadam.leventhal@sun.com zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 138110105Sadam.leventhal@sun.com rc->rc_offset + rc->rc_size, NULL, 138210105Sadam.leventhal@sun.com 1 << tvd->vdev_ashift, 138310105Sadam.leventhal@sun.com zio->io_type, zio->io_priority, 138410105Sadam.leventhal@sun.com ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 138510105Sadam.leventhal@sun.com } 138610105Sadam.leventhal@sun.com 13877754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 1388789Sahrens } 1389789Sahrens 1390789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 1391789Sahrens 13922082Seschrock /* 13932082Seschrock * Iterate over the columns in reverse order so that we hit the parity 139410105Sadam.leventhal@sun.com * last -- any errors along the way will force us to read the parity. 13952082Seschrock */ 1396789Sahrens for (c = rm->rm_cols - 1; c >= 0; c--) { 1397789Sahrens rc = &rm->rm_col[c]; 13982082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 13995329Sgw25295 if (!vdev_readable(cvd)) { 14002082Seschrock if (c >= rm->rm_firstdatacol) 14012082Seschrock rm->rm_missingdata++; 14022082Seschrock else 14032082Seschrock rm->rm_missingparity++; 1404789Sahrens rc->rc_error = ENXIO; 1405789Sahrens rc->rc_tried = 1; /* don't even try */ 1406789Sahrens rc->rc_skipped = 1; 1407789Sahrens continue; 1408789Sahrens } 14098241SJeff.Bonwick@Sun.COM if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) { 14102082Seschrock if (c >= rm->rm_firstdatacol) 14112082Seschrock rm->rm_missingdata++; 14122082Seschrock else 14132082Seschrock rm->rm_missingparity++; 1414789Sahrens rc->rc_error = ESTALE; 1415789Sahrens rc->rc_skipped = 1; 1416789Sahrens continue; 1417789Sahrens } 14182082Seschrock if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 14199434SMark.Musante@Sun.COM (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1420789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1421789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 14227754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0, 1423789Sahrens vdev_raidz_child_done, rc)); 1424789Sahrens } 1425789Sahrens } 1426789Sahrens 14277754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 1428789Sahrens } 1429789Sahrens 14301544Seschrock /* 14311544Seschrock * Report a checksum error for a child of a RAID-Z device. 14321544Seschrock */ 14331544Seschrock static void 14341544Seschrock raidz_checksum_error(zio_t *zio, raidz_col_t *rc) 14351544Seschrock { 14362082Seschrock vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 14371544Seschrock 14381544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 14391544Seschrock mutex_enter(&vd->vdev_stat_lock); 14401544Seschrock vd->vdev_stat.vs_checksum_errors++; 14411544Seschrock mutex_exit(&vd->vdev_stat_lock); 14421544Seschrock } 14431544Seschrock 14441544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 14451544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 14461544Seschrock zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); 14471544Seschrock } 14481544Seschrock 14492082Seschrock /* 14502082Seschrock * Generate the parity from the data columns. If we tried and were able to 14512082Seschrock * read the parity without error, verify that the generated parity matches the 14522082Seschrock * data we read. If it doesn't, we fire off a checksum error. Return the 14532082Seschrock * number such failures. 14542082Seschrock */ 14552082Seschrock static int 14562082Seschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 14572082Seschrock { 14582082Seschrock void *orig[VDEV_RAIDZ_MAXPARITY]; 14592082Seschrock int c, ret = 0; 14602082Seschrock raidz_col_t *rc; 14612082Seschrock 14622082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 14632082Seschrock rc = &rm->rm_col[c]; 14642082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 14652082Seschrock continue; 14662082Seschrock orig[c] = zio_buf_alloc(rc->rc_size); 14672082Seschrock bcopy(rc->rc_data, orig[c], rc->rc_size); 14682082Seschrock } 14692082Seschrock 147010105Sadam.leventhal@sun.com vdev_raidz_generate_parity(rm); 14712082Seschrock 14722082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 14732082Seschrock rc = &rm->rm_col[c]; 14742082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 14752082Seschrock continue; 14762082Seschrock if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 14772082Seschrock raidz_checksum_error(zio, rc); 14782082Seschrock rc->rc_error = ECKSUM; 14792082Seschrock ret++; 14802082Seschrock } 14812082Seschrock zio_buf_free(orig[c], rc->rc_size); 14822082Seschrock } 14832082Seschrock 14842082Seschrock return (ret); 14852082Seschrock } 14862082Seschrock 148710105Sadam.leventhal@sun.com /* 148810105Sadam.leventhal@sun.com * Keep statistics on all the ways that we used parity to correct data. 148910105Sadam.leventhal@sun.com */ 149010105Sadam.leventhal@sun.com static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 14911544Seschrock 14925530Sbonwick static int 14937754SJeff.Bonwick@Sun.COM vdev_raidz_worst_error(raidz_map_t *rm) 14947754SJeff.Bonwick@Sun.COM { 14957754SJeff.Bonwick@Sun.COM int error = 0; 14967754SJeff.Bonwick@Sun.COM 14977754SJeff.Bonwick@Sun.COM for (int c = 0; c < rm->rm_cols; c++) 14987754SJeff.Bonwick@Sun.COM error = zio_worst_error(error, rm->rm_col[c].rc_error); 14997754SJeff.Bonwick@Sun.COM 15007754SJeff.Bonwick@Sun.COM return (error); 15017754SJeff.Bonwick@Sun.COM } 15027754SJeff.Bonwick@Sun.COM 150310105Sadam.leventhal@sun.com /* 150410105Sadam.leventhal@sun.com * Iterate over all combinations of bad data and attempt a reconstruction. 150510105Sadam.leventhal@sun.com * Note that the algorithm below is non-optimal because it doesn't take into 150610105Sadam.leventhal@sun.com * account how reconstruction is actually performed. For example, with 150710105Sadam.leventhal@sun.com * triple-parity RAID-Z the reconstruction procedure is the same if column 4 150810105Sadam.leventhal@sun.com * is targeted as invalid as if columns 1 and 4 are targeted since in both 150910105Sadam.leventhal@sun.com * cases we'd only use parity information in column 0. 151010105Sadam.leventhal@sun.com */ 151110105Sadam.leventhal@sun.com static int 151210105Sadam.leventhal@sun.com vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) 151310105Sadam.leventhal@sun.com { 151410105Sadam.leventhal@sun.com raidz_map_t *rm = zio->io_vsd; 151510105Sadam.leventhal@sun.com raidz_col_t *rc; 151610105Sadam.leventhal@sun.com void *orig[VDEV_RAIDZ_MAXPARITY]; 151710105Sadam.leventhal@sun.com int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 151810105Sadam.leventhal@sun.com int *tgts = &tstore[1]; 151910105Sadam.leventhal@sun.com int current, next, i, c, n; 152010105Sadam.leventhal@sun.com int code, ret = 0; 152110105Sadam.leventhal@sun.com 152210105Sadam.leventhal@sun.com ASSERT(total_errors < rm->rm_firstdatacol); 152310105Sadam.leventhal@sun.com 152410105Sadam.leventhal@sun.com /* 152510105Sadam.leventhal@sun.com * This simplifies one edge condition. 152610105Sadam.leventhal@sun.com */ 152710105Sadam.leventhal@sun.com tgts[-1] = -1; 152810105Sadam.leventhal@sun.com 152910105Sadam.leventhal@sun.com for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 153010105Sadam.leventhal@sun.com /* 153110105Sadam.leventhal@sun.com * Initialize the targets array by finding the first n columns 153210105Sadam.leventhal@sun.com * that contain no error. 153310105Sadam.leventhal@sun.com * 153410105Sadam.leventhal@sun.com * If there were no data errors, we need to ensure that we're 153510105Sadam.leventhal@sun.com * always explicitly attempting to reconstruct at least one 153610105Sadam.leventhal@sun.com * data column. To do this, we simply push the highest target 153710105Sadam.leventhal@sun.com * up into the data columns. 153810105Sadam.leventhal@sun.com */ 153910105Sadam.leventhal@sun.com for (c = 0, i = 0; i < n; i++) { 154010105Sadam.leventhal@sun.com if (i == n - 1 && data_errors == 0 && 154110105Sadam.leventhal@sun.com c < rm->rm_firstdatacol) { 154210105Sadam.leventhal@sun.com c = rm->rm_firstdatacol; 154310105Sadam.leventhal@sun.com } 154410105Sadam.leventhal@sun.com 154510105Sadam.leventhal@sun.com while (rm->rm_col[c].rc_error != 0) { 154610105Sadam.leventhal@sun.com c++; 154710105Sadam.leventhal@sun.com ASSERT3S(c, <, rm->rm_cols); 154810105Sadam.leventhal@sun.com } 154910105Sadam.leventhal@sun.com 155010105Sadam.leventhal@sun.com tgts[i] = c++; 155110105Sadam.leventhal@sun.com } 155210105Sadam.leventhal@sun.com 155310105Sadam.leventhal@sun.com /* 155410105Sadam.leventhal@sun.com * Setting tgts[n] simplifies the other edge condition. 155510105Sadam.leventhal@sun.com */ 155610105Sadam.leventhal@sun.com tgts[n] = rm->rm_cols; 155710105Sadam.leventhal@sun.com 155810105Sadam.leventhal@sun.com /* 155910105Sadam.leventhal@sun.com * These buffers were allocated in previous iterations. 156010105Sadam.leventhal@sun.com */ 156110105Sadam.leventhal@sun.com for (i = 0; i < n - 1; i++) { 156210105Sadam.leventhal@sun.com ASSERT(orig[i] != NULL); 156310105Sadam.leventhal@sun.com } 156410105Sadam.leventhal@sun.com 156510105Sadam.leventhal@sun.com orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); 156610105Sadam.leventhal@sun.com 156710105Sadam.leventhal@sun.com current = 0; 156810105Sadam.leventhal@sun.com next = tgts[current]; 156910105Sadam.leventhal@sun.com 157010105Sadam.leventhal@sun.com while (current != n) { 157110105Sadam.leventhal@sun.com tgts[current] = next; 157210105Sadam.leventhal@sun.com current = 0; 157310105Sadam.leventhal@sun.com 157410105Sadam.leventhal@sun.com /* 157510105Sadam.leventhal@sun.com * Save off the original data that we're going to 157610105Sadam.leventhal@sun.com * attempt to reconstruct. 157710105Sadam.leventhal@sun.com */ 157810105Sadam.leventhal@sun.com for (i = 0; i < n; i++) { 157910105Sadam.leventhal@sun.com ASSERT(orig[i] != NULL); 158010105Sadam.leventhal@sun.com c = tgts[i]; 158110105Sadam.leventhal@sun.com ASSERT3S(c, >=, 0); 158210105Sadam.leventhal@sun.com ASSERT3S(c, <, rm->rm_cols); 158310105Sadam.leventhal@sun.com rc = &rm->rm_col[c]; 158410105Sadam.leventhal@sun.com bcopy(rc->rc_data, orig[i], rc->rc_size); 158510105Sadam.leventhal@sun.com } 158610105Sadam.leventhal@sun.com 158710105Sadam.leventhal@sun.com /* 158810105Sadam.leventhal@sun.com * Attempt a reconstruction and exit the outer loop on 158910105Sadam.leventhal@sun.com * success. 159010105Sadam.leventhal@sun.com */ 159110105Sadam.leventhal@sun.com code = vdev_raidz_reconstruct(rm, tgts, n); 159210105Sadam.leventhal@sun.com if (zio_checksum_error(zio) == 0) { 159310105Sadam.leventhal@sun.com atomic_inc_64(&raidz_corrected[code]); 159410105Sadam.leventhal@sun.com 159510105Sadam.leventhal@sun.com for (i = 0; i < n; i++) { 159610105Sadam.leventhal@sun.com c = tgts[i]; 159710105Sadam.leventhal@sun.com rc = &rm->rm_col[c]; 159810105Sadam.leventhal@sun.com ASSERT(rc->rc_error == 0); 1599*10450Sadam.leventhal@sun.com if (rc->rc_tried) { 1600*10450Sadam.leventhal@sun.com if (bcmp(orig[i], rc->rc_data, 1601*10450Sadam.leventhal@sun.com rc->rc_size) == 0) 1602*10450Sadam.leventhal@sun.com continue; 160310105Sadam.leventhal@sun.com raidz_checksum_error(zio, rc); 1604*10450Sadam.leventhal@sun.com } 160510105Sadam.leventhal@sun.com rc->rc_error = ECKSUM; 160610105Sadam.leventhal@sun.com } 160710105Sadam.leventhal@sun.com 160810105Sadam.leventhal@sun.com ret = code; 160910105Sadam.leventhal@sun.com goto done; 161010105Sadam.leventhal@sun.com } 161110105Sadam.leventhal@sun.com 161210105Sadam.leventhal@sun.com /* 161310105Sadam.leventhal@sun.com * Restore the original data. 161410105Sadam.leventhal@sun.com */ 161510105Sadam.leventhal@sun.com for (i = 0; i < n; i++) { 161610105Sadam.leventhal@sun.com c = tgts[i]; 161710105Sadam.leventhal@sun.com rc = &rm->rm_col[c]; 161810105Sadam.leventhal@sun.com bcopy(orig[i], rc->rc_data, rc->rc_size); 161910105Sadam.leventhal@sun.com } 162010105Sadam.leventhal@sun.com 162110105Sadam.leventhal@sun.com do { 162210105Sadam.leventhal@sun.com /* 162310105Sadam.leventhal@sun.com * Find the next valid column after the current 162410105Sadam.leventhal@sun.com * position.. 162510105Sadam.leventhal@sun.com */ 162610105Sadam.leventhal@sun.com for (next = tgts[current] + 1; 162710105Sadam.leventhal@sun.com next < rm->rm_cols && 162810105Sadam.leventhal@sun.com rm->rm_col[next].rc_error != 0; next++) 162910105Sadam.leventhal@sun.com continue; 163010105Sadam.leventhal@sun.com 163110105Sadam.leventhal@sun.com ASSERT(next <= tgts[current + 1]); 163210105Sadam.leventhal@sun.com 163310105Sadam.leventhal@sun.com /* 163410105Sadam.leventhal@sun.com * If that spot is available, we're done here. 163510105Sadam.leventhal@sun.com */ 163610105Sadam.leventhal@sun.com if (next != tgts[current + 1]) 163710105Sadam.leventhal@sun.com break; 163810105Sadam.leventhal@sun.com 163910105Sadam.leventhal@sun.com /* 164010105Sadam.leventhal@sun.com * Otherwise, find the next valid column after 164110105Sadam.leventhal@sun.com * the previous position. 164210105Sadam.leventhal@sun.com */ 164310105Sadam.leventhal@sun.com for (c = tgts[current - 1] + 1; 164410105Sadam.leventhal@sun.com rm->rm_col[c].rc_error != 0; c++) 164510105Sadam.leventhal@sun.com continue; 164610105Sadam.leventhal@sun.com 164710105Sadam.leventhal@sun.com tgts[current] = c; 164810105Sadam.leventhal@sun.com current++; 164910105Sadam.leventhal@sun.com 165010105Sadam.leventhal@sun.com } while (current != n); 165110105Sadam.leventhal@sun.com } 165210105Sadam.leventhal@sun.com } 165310105Sadam.leventhal@sun.com n--; 165410105Sadam.leventhal@sun.com done: 165510105Sadam.leventhal@sun.com for (i = 0; i < n; i++) { 165610105Sadam.leventhal@sun.com zio_buf_free(orig[i], rm->rm_col[0].rc_size); 165710105Sadam.leventhal@sun.com } 165810105Sadam.leventhal@sun.com 165910105Sadam.leventhal@sun.com return (ret); 166010105Sadam.leventhal@sun.com } 166110105Sadam.leventhal@sun.com 16627754SJeff.Bonwick@Sun.COM static void 1663789Sahrens vdev_raidz_io_done(zio_t *zio) 1664789Sahrens { 1665789Sahrens vdev_t *vd = zio->io_vd; 1666789Sahrens vdev_t *cvd; 1667789Sahrens raidz_map_t *rm = zio->io_vsd; 166810105Sadam.leventhal@sun.com raidz_col_t *rc; 1669789Sahrens int unexpected_errors = 0; 16702082Seschrock int parity_errors = 0; 16713456Sahl int parity_untried = 0; 16722082Seschrock int data_errors = 0; 16737754SJeff.Bonwick@Sun.COM int total_errors = 0; 167410105Sadam.leventhal@sun.com int n, c; 167510105Sadam.leventhal@sun.com int tgts[VDEV_RAIDZ_MAXPARITY]; 167610105Sadam.leventhal@sun.com int code; 1677789Sahrens 16781775Sbillm ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 1679789Sahrens 16802082Seschrock ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 16812082Seschrock ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 16822082Seschrock 1683789Sahrens for (c = 0; c < rm->rm_cols; c++) { 1684789Sahrens rc = &rm->rm_col[c]; 1685789Sahrens 1686789Sahrens if (rc->rc_error) { 16877754SJeff.Bonwick@Sun.COM ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 16882082Seschrock 16892082Seschrock if (c < rm->rm_firstdatacol) 16902082Seschrock parity_errors++; 16912082Seschrock else 16922082Seschrock data_errors++; 16932082Seschrock 1694789Sahrens if (!rc->rc_skipped) 1695789Sahrens unexpected_errors++; 16962082Seschrock 16977754SJeff.Bonwick@Sun.COM total_errors++; 16983456Sahl } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 16993456Sahl parity_untried++; 1700789Sahrens } 1701789Sahrens } 1702789Sahrens 1703789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 1704789Sahrens /* 17057754SJeff.Bonwick@Sun.COM * XXX -- for now, treat partial writes as a success. 17067754SJeff.Bonwick@Sun.COM * (If we couldn't write enough columns to reconstruct 17077754SJeff.Bonwick@Sun.COM * the data, the I/O failed. Otherwise, good enough.) 17087754SJeff.Bonwick@Sun.COM * 17097754SJeff.Bonwick@Sun.COM * Now that we support write reallocation, it would be better 17107754SJeff.Bonwick@Sun.COM * to treat partial failure as real failure unless there are 17117754SJeff.Bonwick@Sun.COM * no non-degraded top-level vdevs left, and not update DTLs 17127754SJeff.Bonwick@Sun.COM * if we intend to reallocate. 1713789Sahrens */ 1714789Sahrens /* XXPOLICY */ 17157754SJeff.Bonwick@Sun.COM if (total_errors > rm->rm_firstdatacol) 17167754SJeff.Bonwick@Sun.COM zio->io_error = vdev_raidz_worst_error(rm); 1717789Sahrens 17187754SJeff.Bonwick@Sun.COM return; 1719789Sahrens } 1720789Sahrens 1721789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 17222082Seschrock /* 17232082Seschrock * There are three potential phases for a read: 17242082Seschrock * 1. produce valid data from the columns read 17252082Seschrock * 2. read all disks and try again 17262082Seschrock * 3. perform combinatorial reconstruction 17272082Seschrock * 17282082Seschrock * Each phase is progressively both more expensive and less likely to 17292082Seschrock * occur. If we encounter more errors than we can repair or all phases 17302082Seschrock * fail, we have no choice but to return an error. 17312082Seschrock */ 1732789Sahrens 1733789Sahrens /* 17342082Seschrock * If the number of errors we saw was correctable -- less than or equal 17353456Sahl * to the number of parity disks read -- attempt to produce data that 17363456Sahl * has a valid checksum. Naturally, this case applies in the absence of 17373456Sahl * any errors. 1738789Sahrens */ 17397754SJeff.Bonwick@Sun.COM if (total_errors <= rm->rm_firstdatacol - parity_untried) { 174010105Sadam.leventhal@sun.com if (data_errors == 0) { 17412082Seschrock if (zio_checksum_error(zio) == 0) { 17424034Sahl /* 17434034Sahl * If we read parity information (unnecessarily 17444034Sahl * as it happens since no reconstruction was 17454034Sahl * needed) regenerate and verify the parity. 17464034Sahl * We also regenerate parity when resilvering 17474034Sahl * so we can write it out to the failed device 17484034Sahl * later. 17494034Sahl */ 17503456Sahl if (parity_errors + parity_untried < 17514034Sahl rm->rm_firstdatacol || 17524034Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) { 17533456Sahl n = raidz_parity_verify(zio, rm); 17543456Sahl unexpected_errors += n; 17553456Sahl ASSERT(parity_errors + n <= 17563456Sahl rm->rm_firstdatacol); 17573456Sahl } 17582082Seschrock goto done; 17592082Seschrock } 176010105Sadam.leventhal@sun.com } else { 17613456Sahl /* 17623456Sahl * We either attempt to read all the parity columns or 17633456Sahl * none of them. If we didn't try to read parity, we 17643456Sahl * wouldn't be here in the correctable case. There must 17653456Sahl * also have been fewer parity errors than parity 17663456Sahl * columns or, again, we wouldn't be in this code path. 17673456Sahl */ 17683456Sahl ASSERT(parity_untried == 0); 17692082Seschrock ASSERT(parity_errors < rm->rm_firstdatacol); 17702082Seschrock 17712082Seschrock /* 177210105Sadam.leventhal@sun.com * Identify the data columns that reported an error. 17732082Seschrock */ 177410105Sadam.leventhal@sun.com n = 0; 17752082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 17762082Seschrock rc = &rm->rm_col[c]; 177710105Sadam.leventhal@sun.com if (rc->rc_error != 0) { 177810105Sadam.leventhal@sun.com ASSERT(n < VDEV_RAIDZ_MAXPARITY); 177910105Sadam.leventhal@sun.com tgts[n++] = c; 178010105Sadam.leventhal@sun.com } 17812082Seschrock } 17822082Seschrock 178310105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol >= n); 178410105Sadam.leventhal@sun.com 178510105Sadam.leventhal@sun.com code = vdev_raidz_reconstruct(rm, tgts, n); 17862082Seschrock 17872082Seschrock if (zio_checksum_error(zio) == 0) { 178810105Sadam.leventhal@sun.com atomic_inc_64(&raidz_corrected[code]); 1789789Sahrens 17902082Seschrock /* 179110105Sadam.leventhal@sun.com * If we read more parity disks than were used 179210105Sadam.leventhal@sun.com * for reconstruction, confirm that the other 179310105Sadam.leventhal@sun.com * parity disks produced correct data. This 179410105Sadam.leventhal@sun.com * routine is suboptimal in that it regenerates 179510105Sadam.leventhal@sun.com * the parity that we already used in addition 179610105Sadam.leventhal@sun.com * to the parity that we're attempting to 179710105Sadam.leventhal@sun.com * verify, but this should be a relatively 179810105Sadam.leventhal@sun.com * uncommon case, and can be optimized if it 179910105Sadam.leventhal@sun.com * becomes a problem. Note that we regenerate 180010105Sadam.leventhal@sun.com * parity when resilvering so we can write it 180110105Sadam.leventhal@sun.com * out to failed devices later. 18022082Seschrock */ 180310105Sadam.leventhal@sun.com if (parity_errors < rm->rm_firstdatacol - n || 18044034Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) { 18052082Seschrock n = raidz_parity_verify(zio, rm); 18062082Seschrock unexpected_errors += n; 18072082Seschrock ASSERT(parity_errors + n <= 18082082Seschrock rm->rm_firstdatacol); 18092082Seschrock } 18102082Seschrock 18112082Seschrock goto done; 18122082Seschrock } 1813789Sahrens } 1814789Sahrens } 1815789Sahrens 1816789Sahrens /* 18172082Seschrock * This isn't a typical situation -- either we got a read error or 18182082Seschrock * a child silently returned bad data. Read every block so we can 18192082Seschrock * try again with as much data and parity as we can track down. If 18202082Seschrock * we've already been through once before, all children will be marked 18212082Seschrock * as tried so we'll proceed to combinatorial reconstruction. 1822789Sahrens */ 1823789Sahrens unexpected_errors = 1; 18242082Seschrock rm->rm_missingdata = 0; 18252082Seschrock rm->rm_missingparity = 0; 1826789Sahrens 18272082Seschrock for (c = 0; c < rm->rm_cols; c++) { 18282082Seschrock if (rm->rm_col[c].rc_tried) 18292082Seschrock continue; 1830789Sahrens 1831789Sahrens zio_vdev_io_redone(zio); 18322082Seschrock do { 1833789Sahrens rc = &rm->rm_col[c]; 1834789Sahrens if (rc->rc_tried) 1835789Sahrens continue; 1836789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, 18372082Seschrock vd->vdev_child[rc->rc_devidx], 1838789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 18397754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0, 1840789Sahrens vdev_raidz_child_done, rc)); 18412082Seschrock } while (++c < rm->rm_cols); 18425530Sbonwick 18437754SJeff.Bonwick@Sun.COM return; 1844789Sahrens } 1845789Sahrens 1846789Sahrens /* 18472082Seschrock * At this point we've attempted to reconstruct the data given the 18482082Seschrock * errors we detected, and we've attempted to read all columns. There 18492082Seschrock * must, therefore, be one or more additional problems -- silent errors 18502082Seschrock * resulting in invalid data rather than explicit I/O errors resulting 185110105Sadam.leventhal@sun.com * in absent data. We check if there is enough additional data to 185210105Sadam.leventhal@sun.com * possibly reconstruct the data and then perform combinatorial 185310105Sadam.leventhal@sun.com * reconstruction over all possible combinations. If that fails, 185410105Sadam.leventhal@sun.com * we're cooked. 1855789Sahrens */ 18567754SJeff.Bonwick@Sun.COM if (total_errors >= rm->rm_firstdatacol) { 18577754SJeff.Bonwick@Sun.COM zio->io_error = vdev_raidz_worst_error(rm); 18587754SJeff.Bonwick@Sun.COM /* 18597754SJeff.Bonwick@Sun.COM * If there were exactly as many device errors as parity 18607754SJeff.Bonwick@Sun.COM * columns, yet we couldn't reconstruct the data, then at 18617754SJeff.Bonwick@Sun.COM * least one device must have returned bad data silently. 18627754SJeff.Bonwick@Sun.COM */ 18637754SJeff.Bonwick@Sun.COM if (total_errors == rm->rm_firstdatacol) 18647754SJeff.Bonwick@Sun.COM zio->io_error = zio_worst_error(zio->io_error, ECKSUM); 18652082Seschrock 186610105Sadam.leventhal@sun.com } else if ((code = vdev_raidz_combrec(zio, total_errors, 186710105Sadam.leventhal@sun.com data_errors)) != 0) { 18682082Seschrock /* 186910105Sadam.leventhal@sun.com * If we didn't use all the available parity for the 187010105Sadam.leventhal@sun.com * combinatorial reconstruction, verify that the remaining 187110105Sadam.leventhal@sun.com * parity is correct. 18722082Seschrock */ 187310105Sadam.leventhal@sun.com if (code != (1 << rm->rm_firstdatacol) - 1) 187410105Sadam.leventhal@sun.com (void) raidz_parity_verify(zio, rm); 187510105Sadam.leventhal@sun.com } else { 187610105Sadam.leventhal@sun.com /* 187710105Sadam.leventhal@sun.com * All combinations failed to checksum. Generate checksum 187810105Sadam.leventhal@sun.com * ereports for all children. 187910105Sadam.leventhal@sun.com */ 188010105Sadam.leventhal@sun.com zio->io_error = ECKSUM; 18812082Seschrock 188210105Sadam.leventhal@sun.com if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 188310105Sadam.leventhal@sun.com for (c = 0; c < rm->rm_cols; c++) { 188410105Sadam.leventhal@sun.com rc = &rm->rm_col[c]; 188510105Sadam.leventhal@sun.com zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 188610105Sadam.leventhal@sun.com zio->io_spa, vd->vdev_child[rc->rc_devidx], 188710105Sadam.leventhal@sun.com zio, rc->rc_offset, rc->rc_size); 18882082Seschrock } 18891544Seschrock } 18901544Seschrock } 1891789Sahrens 1892789Sahrens done: 1893789Sahrens zio_checksum_verified(zio); 1894789Sahrens 18958241SJeff.Bonwick@Sun.COM if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 1896789Sahrens (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 1897789Sahrens /* 1898789Sahrens * Use the good data we have in hand to repair damaged children. 1899789Sahrens */ 1900789Sahrens for (c = 0; c < rm->rm_cols; c++) { 1901789Sahrens rc = &rm->rm_col[c]; 19022082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 1903789Sahrens 19041732Sbonwick if (rc->rc_error == 0) 19051732Sbonwick continue; 19061732Sbonwick 19077754SJeff.Bonwick@Sun.COM zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 19081732Sbonwick rc->rc_offset, rc->rc_data, rc->rc_size, 19091732Sbonwick ZIO_TYPE_WRITE, zio->io_priority, 19108241SJeff.Bonwick@Sun.COM ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 19118241SJeff.Bonwick@Sun.COM ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 19121732Sbonwick } 1913789Sahrens } 1914789Sahrens } 1915789Sahrens 1916789Sahrens static void 1917789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 1918789Sahrens { 19192082Seschrock if (faulted > vd->vdev_nparity) 19201544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 19211544Seschrock VDEV_AUX_NO_REPLICAS); 1922789Sahrens else if (degraded + faulted != 0) 19231544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 1924789Sahrens else 19251544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 1926789Sahrens } 1927789Sahrens 1928789Sahrens vdev_ops_t vdev_raidz_ops = { 1929789Sahrens vdev_raidz_open, 1930789Sahrens vdev_raidz_close, 1931789Sahrens vdev_raidz_asize, 1932789Sahrens vdev_raidz_io_start, 1933789Sahrens vdev_raidz_io_done, 1934789Sahrens vdev_raidz_state_change, 1935789Sahrens VDEV_TYPE_RAIDZ, /* name of this vdev type */ 1936789Sahrens B_FALSE /* not a leaf vdev */ 1937789Sahrens }; 1938