1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 212082Seschrock 22789Sahrens /* 239434SMark.Musante@Sun.COM * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24789Sahrens * Use is subject to license terms. 25789Sahrens */ 26789Sahrens 27789Sahrens #include <sys/zfs_context.h> 28789Sahrens #include <sys/spa.h> 29789Sahrens #include <sys/vdev_impl.h> 30789Sahrens #include <sys/zio.h> 31789Sahrens #include <sys/zio_checksum.h> 32789Sahrens #include <sys/fs/zfs.h> 331544Seschrock #include <sys/fm/fs/zfs.h> 34789Sahrens 35789Sahrens /* 36789Sahrens * Virtual device vector for RAID-Z. 372082Seschrock * 3810105Sadam.leventhal@sun.com * This vdev supports single, double, and triple parity. For single parity, 3910105Sadam.leventhal@sun.com * we use a simple XOR of all the data columns. For double or triple parity, 4010105Sadam.leventhal@sun.com * we use a special case of Reed-Solomon coding. 
This extends the 4110105Sadam.leventhal@sun.com * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 4210105Sadam.leventhal@sun.com * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 4310105Sadam.leventhal@sun.com * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 4410105Sadam.leventhal@sun.com * former is also based. The latter is designed to provide higher performance 4510105Sadam.leventhal@sun.com * for writes. 4610105Sadam.leventhal@sun.com * 4710105Sadam.leventhal@sun.com * Note that the Plank paper claimed to support arbitrary N+M, but was then 4810105Sadam.leventhal@sun.com * amended six years later identifying a critical flaw that invalidates its 4910105Sadam.leventhal@sun.com * claims. Nevertheless, the technique can be adapted to work for up to 5010105Sadam.leventhal@sun.com * triple parity. For additional parity, the amendment "Note: Correction to 5110105Sadam.leventhal@sun.com * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 5210105Sadam.leventhal@sun.com * is viable, but the additional complexity means that write performance will 5310105Sadam.leventhal@sun.com * suffer. 5410105Sadam.leventhal@sun.com * 5510105Sadam.leventhal@sun.com * All of the methods above operate on a Galois field, defined over the 5610105Sadam.leventhal@sun.com * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 5710105Sadam.leventhal@sun.com * can be expressed with a single byte. 
Briefly, the operations on the 5810105Sadam.leventhal@sun.com * field are defined as follows: 592082Seschrock * 602082Seschrock * o addition (+) is represented by a bitwise XOR 612082Seschrock * o subtraction (-) is therefore identical to addition: A + B = A - B 622082Seschrock * o multiplication of A by 2 is defined by the following bitwise expression: 632082Seschrock * (A * 2)_7 = A_6 642082Seschrock * (A * 2)_6 = A_5 652082Seschrock * (A * 2)_5 = A_4 662082Seschrock * (A * 2)_4 = A_3 + A_7 672082Seschrock * (A * 2)_3 = A_2 + A_7 682082Seschrock * (A * 2)_2 = A_1 + A_7 692082Seschrock * (A * 2)_1 = A_0 702082Seschrock * (A * 2)_0 = A_7 712082Seschrock * 722082Seschrock * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 7310105Sadam.leventhal@sun.com * As an aside, this multiplication is derived from the error correcting 7410105Sadam.leventhal@sun.com * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 752082Seschrock * 762082Seschrock * Observe that any number in the field (except for 0) can be expressed as a 772082Seschrock * power of 2 -- a generator for the field. We store a table of the powers of 782082Seschrock * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 792082Seschrock * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 8010105Sadam.leventhal@sun.com * than field addition). The inverse of a field element A (A^-1) is therefore 8110105Sadam.leventhal@sun.com * A ^ (255 - 1) = A^254. 822082Seschrock * 8310105Sadam.leventhal@sun.com * The up-to-three parity columns, P, Q, R over several data columns, 8410105Sadam.leventhal@sun.com * D_0, ... D_n-1, can be expressed by field operations: 852082Seschrock * 862082Seschrock * P = D_0 + D_1 + ... + D_n-2 + D_n-1 872082Seschrock * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 882082Seschrock * = ((...((D_0) * 2 + D_1) * 2 + ...) 
* 2 + D_n-2) * 2 + D_n-1 8910105Sadam.leventhal@sun.com * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 9010105Sadam.leventhal@sun.com * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 912082Seschrock * 9210105Sadam.leventhal@sun.com * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival 9310105Sadam.leventhal@sun.com * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 9410105Sadam.leventhal@sun.com * independent coefficients. (There are no additional coefficients that have 9510105Sadam.leventhal@sun.com * this property which is why the uncorrected Plank method breaks down.) 9610105Sadam.leventhal@sun.com * 9710105Sadam.leventhal@sun.com * See the reconstruction code below for how P, Q and R can used individually 9810105Sadam.leventhal@sun.com * or in concert to recover missing data columns. 99789Sahrens */ 100789Sahrens 101789Sahrens typedef struct raidz_col { 1022082Seschrock uint64_t rc_devidx; /* child device index for I/O */ 1032082Seschrock uint64_t rc_offset; /* device offset */ 1042082Seschrock uint64_t rc_size; /* I/O size */ 1052082Seschrock void *rc_data; /* I/O data */ 106*10614SJonathan.Adams@Sun.COM void *rc_gdata; /* used to store the "good" version */ 1072082Seschrock int rc_error; /* I/O error for this device */ 1082082Seschrock uint8_t rc_tried; /* Did we attempt this I/O column? */ 1092082Seschrock uint8_t rc_skipped; /* Did we skip this I/O column? 
*/ 110789Sahrens } raidz_col_t; 111789Sahrens 112789Sahrens typedef struct raidz_map { 11310105Sadam.leventhal@sun.com uint64_t rm_cols; /* Regular column count */ 11410105Sadam.leventhal@sun.com uint64_t rm_scols; /* Count including skipped columns */ 1152082Seschrock uint64_t rm_bigcols; /* Number of oversized columns */ 1162082Seschrock uint64_t rm_asize; /* Actual total I/O size */ 1172082Seschrock uint64_t rm_missingdata; /* Count of missing data devices */ 1182082Seschrock uint64_t rm_missingparity; /* Count of missing parity devices */ 1192082Seschrock uint64_t rm_firstdatacol; /* First data column/parity count */ 12010450Sadam.leventhal@sun.com uint64_t rm_nskip; /* Skipped sectors for padding */ 12110450Sadam.leventhal@sun.com uint64_t rm_skipstart; /* Column index of padding start */ 122*10614SJonathan.Adams@Sun.COM void *rm_datacopy; /* rm_asize-buffer of copied data */ 123*10614SJonathan.Adams@Sun.COM uintptr_t rm_reports; /* # of referencing checksum reports */ 124*10614SJonathan.Adams@Sun.COM uint8_t rm_freed; /* map no longer has referencing ZIO */ 125*10614SJonathan.Adams@Sun.COM uint8_t rm_ecksuminjected; /* checksum error was injected */ 126*10614SJonathan.Adams@Sun.COM uint64_t rm_skipped; /* Skipped sectors for padding */ 1272082Seschrock raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 128789Sahrens } raidz_map_t; 129789Sahrens 1302082Seschrock #define VDEV_RAIDZ_P 0 1312082Seschrock #define VDEV_RAIDZ_Q 1 13210105Sadam.leventhal@sun.com #define VDEV_RAIDZ_R 2 13310105Sadam.leventhal@sun.com #define VDEV_RAIDZ_MAXPARITY 3 1342082Seschrock 13510105Sadam.leventhal@sun.com #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 
0x1d : 0)) 13610105Sadam.leventhal@sun.com #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 1372082Seschrock 13810105Sadam.leventhal@sun.com /* 13910105Sadam.leventhal@sun.com * We provide a mechanism to perform the field multiplication operation on a 14010105Sadam.leventhal@sun.com * 64-bit value all at once rather than a byte at a time. This works by 14110105Sadam.leventhal@sun.com * creating a mask from the top bit in each byte and using that to 14210105Sadam.leventhal@sun.com * conditionally apply the XOR of 0x1d. 14310105Sadam.leventhal@sun.com */ 14410105Sadam.leventhal@sun.com #define VDEV_RAIDZ_64MUL_2(x, mask) \ 14510105Sadam.leventhal@sun.com { \ 14610105Sadam.leventhal@sun.com (mask) = (x) & 0x8080808080808080ULL; \ 14710105Sadam.leventhal@sun.com (mask) = ((mask) << 1) - ((mask) >> 7); \ 14810105Sadam.leventhal@sun.com (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 14910105Sadam.leventhal@sun.com ((mask) & 0x1d1d1d1d1d1d1d1d); \ 15010105Sadam.leventhal@sun.com } 15110105Sadam.leventhal@sun.com 15210105Sadam.leventhal@sun.com #define VDEV_RAIDZ_64MUL_4(x, mask) \ 15310105Sadam.leventhal@sun.com { \ 15410105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2((x), mask); \ 15510105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2((x), mask); \ 15610105Sadam.leventhal@sun.com } 15710105Sadam.leventhal@sun.com 15810105Sadam.leventhal@sun.com /* 15910105Sadam.leventhal@sun.com * Force reconstruction to use the general purpose method. 16010105Sadam.leventhal@sun.com */ 16110105Sadam.leventhal@sun.com int vdev_raidz_default_to_general; 1622082Seschrock 1632082Seschrock /* 1642082Seschrock * These two tables represent powers and logs of 2 in the Galois field defined 1652082Seschrock * above. These values were computed by repeatedly multiplying by 2 as above. 
1662082Seschrock */ 1672082Seschrock static const uint8_t vdev_raidz_pow2[256] = { 1682082Seschrock 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 1692082Seschrock 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 1702082Seschrock 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 1712082Seschrock 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 1722082Seschrock 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 1732082Seschrock 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 1742082Seschrock 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 1752082Seschrock 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 1762082Seschrock 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 1772082Seschrock 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 1782082Seschrock 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 1792082Seschrock 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 1802082Seschrock 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 1812082Seschrock 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 1822082Seschrock 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 1832082Seschrock 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 1842082Seschrock 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 1852082Seschrock 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 1862082Seschrock 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 1872082Seschrock 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 1882082Seschrock 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 1892082Seschrock 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 1902082Seschrock 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 1912082Seschrock 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 1922082Seschrock 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 1932082Seschrock 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 1942082Seschrock 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 1952082Seschrock 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 1962082Seschrock 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 1972082Seschrock 0xf7, 0xf3, 
0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 1982082Seschrock 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 1992082Seschrock 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 2002082Seschrock }; 2012082Seschrock static const uint8_t vdev_raidz_log2[256] = { 2022082Seschrock 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 2032082Seschrock 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 2042082Seschrock 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 2052082Seschrock 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 2062082Seschrock 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 2072082Seschrock 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 2082082Seschrock 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 2092082Seschrock 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 2102082Seschrock 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 2112082Seschrock 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 2122082Seschrock 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 2132082Seschrock 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 2142082Seschrock 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 2152082Seschrock 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 2162082Seschrock 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 2172082Seschrock 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 2182082Seschrock 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 2192082Seschrock 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 2202082Seschrock 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 2212082Seschrock 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 2222082Seschrock 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 2232082Seschrock 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 2242082Seschrock 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 2252082Seschrock 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 2262082Seschrock 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 2272082Seschrock 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 2282082Seschrock 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 
0xaa, 2292082Seschrock 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 2302082Seschrock 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 2312082Seschrock 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 2322082Seschrock 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 2332082Seschrock 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 2342082Seschrock }; 2352082Seschrock 236*10614SJonathan.Adams@Sun.COM static void vdev_raidz_generate_parity(raidz_map_t *rm); 237*10614SJonathan.Adams@Sun.COM 2382082Seschrock /* 2392082Seschrock * Multiply a given number by 2 raised to the given power. 2402082Seschrock */ 2412082Seschrock static uint8_t 2422082Seschrock vdev_raidz_exp2(uint_t a, int exp) 2432082Seschrock { 2442082Seschrock if (a == 0) 2452082Seschrock return (0); 2462082Seschrock 2472082Seschrock ASSERT(exp >= 0); 2482082Seschrock ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 2492082Seschrock 2502082Seschrock exp += vdev_raidz_log2[a]; 2512082Seschrock if (exp > 255) 2522082Seschrock exp -= 255; 2532082Seschrock 2542082Seschrock return (vdev_raidz_pow2[exp]); 2552082Seschrock } 2562082Seschrock 2577754SJeff.Bonwick@Sun.COM static void 258*10614SJonathan.Adams@Sun.COM vdev_raidz_map_free(raidz_map_t *rm) 2597754SJeff.Bonwick@Sun.COM { 2607754SJeff.Bonwick@Sun.COM int c; 261*10614SJonathan.Adams@Sun.COM size_t size = rm->rm_asize; /* will hold data-size after the loop */ 2627754SJeff.Bonwick@Sun.COM 263*10614SJonathan.Adams@Sun.COM for (c = 0; c < rm->rm_firstdatacol; c++) { 264*10614SJonathan.Adams@Sun.COM size -= rm->rm_col[c].rc_size; 265*10614SJonathan.Adams@Sun.COM 2667754SJeff.Bonwick@Sun.COM zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 2677754SJeff.Bonwick@Sun.COM 268*10614SJonathan.Adams@Sun.COM if (rm->rm_col[c].rc_gdata != NULL) 269*10614SJonathan.Adams@Sun.COM zio_buf_free(rm->rm_col[c].rc_gdata, 270*10614SJonathan.Adams@Sun.COM rm->rm_col[c].rc_size); 271*10614SJonathan.Adams@Sun.COM } 272*10614SJonathan.Adams@Sun.COM 273*10614SJonathan.Adams@Sun.COM 
if (rm->rm_datacopy != NULL) 274*10614SJonathan.Adams@Sun.COM zio_buf_free(rm->rm_datacopy, size); 275*10614SJonathan.Adams@Sun.COM 27610105Sadam.leventhal@sun.com kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 2777754SJeff.Bonwick@Sun.COM } 2787754SJeff.Bonwick@Sun.COM 279*10614SJonathan.Adams@Sun.COM static void 280*10614SJonathan.Adams@Sun.COM vdev_raidz_map_free_vsd(zio_t *zio) 281*10614SJonathan.Adams@Sun.COM { 282*10614SJonathan.Adams@Sun.COM raidz_map_t *rm = zio->io_vsd; 283*10614SJonathan.Adams@Sun.COM 284*10614SJonathan.Adams@Sun.COM ASSERT3U(rm->rm_freed, ==, 0); 285*10614SJonathan.Adams@Sun.COM rm->rm_freed = 1; 286*10614SJonathan.Adams@Sun.COM 287*10614SJonathan.Adams@Sun.COM if (rm->rm_reports == 0) 288*10614SJonathan.Adams@Sun.COM vdev_raidz_map_free(rm); 289*10614SJonathan.Adams@Sun.COM } 290*10614SJonathan.Adams@Sun.COM 291*10614SJonathan.Adams@Sun.COM /*ARGSUSED*/ 292*10614SJonathan.Adams@Sun.COM static void 293*10614SJonathan.Adams@Sun.COM vdev_raidz_cksum_free(void *arg, size_t ignored) 294*10614SJonathan.Adams@Sun.COM { 295*10614SJonathan.Adams@Sun.COM raidz_map_t *rm = arg; 296*10614SJonathan.Adams@Sun.COM 297*10614SJonathan.Adams@Sun.COM ASSERT3U(rm->rm_reports, >, 0); 298*10614SJonathan.Adams@Sun.COM ASSERT3U(rm->rm_freed, !=, 0); 299*10614SJonathan.Adams@Sun.COM 300*10614SJonathan.Adams@Sun.COM if (--rm->rm_reports == 0) 301*10614SJonathan.Adams@Sun.COM vdev_raidz_map_free(rm); 302*10614SJonathan.Adams@Sun.COM } 303*10614SJonathan.Adams@Sun.COM 304*10614SJonathan.Adams@Sun.COM static void 305*10614SJonathan.Adams@Sun.COM vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) 306*10614SJonathan.Adams@Sun.COM { 307*10614SJonathan.Adams@Sun.COM raidz_map_t *rm = zcr->zcr_cbdata; 308*10614SJonathan.Adams@Sun.COM size_t c = zcr->zcr_cbinfo; 309*10614SJonathan.Adams@Sun.COM size_t x; 310*10614SJonathan.Adams@Sun.COM 311*10614SJonathan.Adams@Sun.COM const char *good = NULL; 312*10614SJonathan.Adams@Sun.COM const char 
*bad = rm->rm_col[c].rc_data; 313*10614SJonathan.Adams@Sun.COM 314*10614SJonathan.Adams@Sun.COM if (good_data == NULL) { 315*10614SJonathan.Adams@Sun.COM zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 316*10614SJonathan.Adams@Sun.COM return; 317*10614SJonathan.Adams@Sun.COM } 318*10614SJonathan.Adams@Sun.COM 319*10614SJonathan.Adams@Sun.COM if (c < rm->rm_firstdatacol) { 320*10614SJonathan.Adams@Sun.COM /* 321*10614SJonathan.Adams@Sun.COM * The first time through, calculate the parity blocks for 322*10614SJonathan.Adams@Sun.COM * the good data (this relies on the fact that the good 323*10614SJonathan.Adams@Sun.COM * data never changes for a given logical ZIO) 324*10614SJonathan.Adams@Sun.COM */ 325*10614SJonathan.Adams@Sun.COM if (rm->rm_col[0].rc_gdata == NULL) { 326*10614SJonathan.Adams@Sun.COM char *bad_parity[VDEV_RAIDZ_MAXPARITY]; 327*10614SJonathan.Adams@Sun.COM char *buf; 328*10614SJonathan.Adams@Sun.COM 329*10614SJonathan.Adams@Sun.COM /* 330*10614SJonathan.Adams@Sun.COM * Set up the rm_col[]s to generate the parity for 331*10614SJonathan.Adams@Sun.COM * good_data, first saving the parity bufs and 332*10614SJonathan.Adams@Sun.COM * replacing them with buffers to hold the result. 
333*10614SJonathan.Adams@Sun.COM */ 334*10614SJonathan.Adams@Sun.COM for (x = 0; x < rm->rm_firstdatacol; x++) { 335*10614SJonathan.Adams@Sun.COM bad_parity[x] = rm->rm_col[x].rc_data; 336*10614SJonathan.Adams@Sun.COM rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = 337*10614SJonathan.Adams@Sun.COM zio_buf_alloc(rm->rm_col[x].rc_size); 338*10614SJonathan.Adams@Sun.COM } 339*10614SJonathan.Adams@Sun.COM 340*10614SJonathan.Adams@Sun.COM /* fill in the data columns from good_data */ 341*10614SJonathan.Adams@Sun.COM buf = (char *)good_data; 342*10614SJonathan.Adams@Sun.COM for (; x < rm->rm_cols; x++) { 343*10614SJonathan.Adams@Sun.COM rm->rm_col[x].rc_data = buf; 344*10614SJonathan.Adams@Sun.COM buf += rm->rm_col[x].rc_size; 345*10614SJonathan.Adams@Sun.COM } 346*10614SJonathan.Adams@Sun.COM 347*10614SJonathan.Adams@Sun.COM /* 348*10614SJonathan.Adams@Sun.COM * Construct the parity from the good data. 349*10614SJonathan.Adams@Sun.COM */ 350*10614SJonathan.Adams@Sun.COM vdev_raidz_generate_parity(rm); 351*10614SJonathan.Adams@Sun.COM 352*10614SJonathan.Adams@Sun.COM /* restore everything back to its original state */ 353*10614SJonathan.Adams@Sun.COM for (x = 0; x < rm->rm_firstdatacol; x++) 354*10614SJonathan.Adams@Sun.COM rm->rm_col[x].rc_data = bad_parity[x]; 355*10614SJonathan.Adams@Sun.COM 356*10614SJonathan.Adams@Sun.COM buf = rm->rm_datacopy; 357*10614SJonathan.Adams@Sun.COM for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 358*10614SJonathan.Adams@Sun.COM rm->rm_col[x].rc_data = buf; 359*10614SJonathan.Adams@Sun.COM buf += rm->rm_col[x].rc_size; 360*10614SJonathan.Adams@Sun.COM } 361*10614SJonathan.Adams@Sun.COM } 362*10614SJonathan.Adams@Sun.COM 363*10614SJonathan.Adams@Sun.COM ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 364*10614SJonathan.Adams@Sun.COM good = rm->rm_col[c].rc_gdata; 365*10614SJonathan.Adams@Sun.COM } else { 366*10614SJonathan.Adams@Sun.COM /* adjust good_data to point at the start of our column */ 367*10614SJonathan.Adams@Sun.COM good = 
good_data; 368*10614SJonathan.Adams@Sun.COM 369*10614SJonathan.Adams@Sun.COM for (x = rm->rm_firstdatacol; x < c; x++) 370*10614SJonathan.Adams@Sun.COM good += rm->rm_col[x].rc_size; 371*10614SJonathan.Adams@Sun.COM } 372*10614SJonathan.Adams@Sun.COM 373*10614SJonathan.Adams@Sun.COM /* we drop the ereport if it ends up that the data was good */ 374*10614SJonathan.Adams@Sun.COM zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 375*10614SJonathan.Adams@Sun.COM } 376*10614SJonathan.Adams@Sun.COM 377*10614SJonathan.Adams@Sun.COM /* 378*10614SJonathan.Adams@Sun.COM * Invoked indirectly by zfs_ereport_start_checksum(), called 379*10614SJonathan.Adams@Sun.COM * below when our read operation fails completely. The main point 380*10614SJonathan.Adams@Sun.COM * is to keep a copy of everything we read from disk, so that at 381*10614SJonathan.Adams@Sun.COM * vdev_raidz_cksum_finish() time we can compare it with the good data. 382*10614SJonathan.Adams@Sun.COM */ 383*10614SJonathan.Adams@Sun.COM static void 384*10614SJonathan.Adams@Sun.COM vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 385*10614SJonathan.Adams@Sun.COM { 386*10614SJonathan.Adams@Sun.COM size_t c = (size_t)(uintptr_t)arg; 387*10614SJonathan.Adams@Sun.COM caddr_t buf; 388*10614SJonathan.Adams@Sun.COM 389*10614SJonathan.Adams@Sun.COM raidz_map_t *rm = zio->io_vsd; 390*10614SJonathan.Adams@Sun.COM size_t size; 391*10614SJonathan.Adams@Sun.COM 392*10614SJonathan.Adams@Sun.COM /* set up the report and bump the refcount */ 393*10614SJonathan.Adams@Sun.COM zcr->zcr_cbdata = rm; 394*10614SJonathan.Adams@Sun.COM zcr->zcr_cbinfo = c; 395*10614SJonathan.Adams@Sun.COM zcr->zcr_finish = vdev_raidz_cksum_finish; 396*10614SJonathan.Adams@Sun.COM zcr->zcr_free = vdev_raidz_cksum_free; 397*10614SJonathan.Adams@Sun.COM 398*10614SJonathan.Adams@Sun.COM rm->rm_reports++; 399*10614SJonathan.Adams@Sun.COM ASSERT3U(rm->rm_reports, >, 0); 400*10614SJonathan.Adams@Sun.COM 401*10614SJonathan.Adams@Sun.COM if 
(rm->rm_reports != 1) 402*10614SJonathan.Adams@Sun.COM return; 403*10614SJonathan.Adams@Sun.COM 404*10614SJonathan.Adams@Sun.COM /* 405*10614SJonathan.Adams@Sun.COM * It's the first time we're called, so we need to copy the data 406*10614SJonathan.Adams@Sun.COM * aside; there's no guarantee that our zio's buffer won't be 407*10614SJonathan.Adams@Sun.COM * re-used for something else. 408*10614SJonathan.Adams@Sun.COM * 409*10614SJonathan.Adams@Sun.COM * Our parity data is already in seperate buffers, so there's no need 410*10614SJonathan.Adams@Sun.COM * to copy them. 411*10614SJonathan.Adams@Sun.COM */ 412*10614SJonathan.Adams@Sun.COM ASSERT3P(rm->rm_datacopy, ==, NULL); 413*10614SJonathan.Adams@Sun.COM 414*10614SJonathan.Adams@Sun.COM /* rm_asize includes the parity blocks; subtract them out */ 415*10614SJonathan.Adams@Sun.COM size = rm->rm_asize; 416*10614SJonathan.Adams@Sun.COM for (c = 0; c < rm->rm_firstdatacol; c++) 417*10614SJonathan.Adams@Sun.COM size -= rm->rm_col[c].rc_size; 418*10614SJonathan.Adams@Sun.COM 419*10614SJonathan.Adams@Sun.COM buf = rm->rm_datacopy = zio_buf_alloc(size); 420*10614SJonathan.Adams@Sun.COM for (; c < rm->rm_cols; c++) { 421*10614SJonathan.Adams@Sun.COM raidz_col_t *col = &rm->rm_col[c]; 422*10614SJonathan.Adams@Sun.COM 423*10614SJonathan.Adams@Sun.COM bcopy(col->rc_data, buf, col->rc_size); 424*10614SJonathan.Adams@Sun.COM col->rc_data = buf; 425*10614SJonathan.Adams@Sun.COM 426*10614SJonathan.Adams@Sun.COM buf += col->rc_size; 427*10614SJonathan.Adams@Sun.COM } 428*10614SJonathan.Adams@Sun.COM ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); 429*10614SJonathan.Adams@Sun.COM } 430*10614SJonathan.Adams@Sun.COM 431*10614SJonathan.Adams@Sun.COM static const zio_vsd_ops_t vdev_raidz_vsd_ops = { 432*10614SJonathan.Adams@Sun.COM vdev_raidz_map_free_vsd, 433*10614SJonathan.Adams@Sun.COM vdev_raidz_cksum_report 434*10614SJonathan.Adams@Sun.COM }; 435*10614SJonathan.Adams@Sun.COM 436789Sahrens static raidz_map_t * 4372082Seschrock 
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 4382082Seschrock uint64_t nparity) 439789Sahrens { 440789Sahrens raidz_map_t *rm; 441789Sahrens uint64_t b = zio->io_offset >> unit_shift; 442789Sahrens uint64_t s = zio->io_size >> unit_shift; 443789Sahrens uint64_t f = b % dcols; 444789Sahrens uint64_t o = (b / dcols) << unit_shift; 44510105Sadam.leventhal@sun.com uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 446789Sahrens 4472082Seschrock q = s / (dcols - nparity); 4482082Seschrock r = s - q * (dcols - nparity); 4492082Seschrock bc = (r == 0 ? 0 : r + nparity); 45010105Sadam.leventhal@sun.com tot = s + nparity * (q + (r == 0 ? 0 : 1)); 451789Sahrens 45210105Sadam.leventhal@sun.com if (q == 0) { 45310105Sadam.leventhal@sun.com acols = bc; 45410105Sadam.leventhal@sun.com scols = MIN(dcols, roundup(bc, nparity + 1)); 45510105Sadam.leventhal@sun.com } else { 45610105Sadam.leventhal@sun.com acols = dcols; 45710105Sadam.leventhal@sun.com scols = dcols; 45810105Sadam.leventhal@sun.com } 459789Sahrens 46010105Sadam.leventhal@sun.com ASSERT3U(acols, <=, scols); 46110105Sadam.leventhal@sun.com 46210105Sadam.leventhal@sun.com rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 463789Sahrens 464789Sahrens rm->rm_cols = acols; 46510105Sadam.leventhal@sun.com rm->rm_scols = scols; 466789Sahrens rm->rm_bigcols = bc; 46710450Sadam.leventhal@sun.com rm->rm_skipstart = bc; 4682082Seschrock rm->rm_missingdata = 0; 4692082Seschrock rm->rm_missingparity = 0; 4702082Seschrock rm->rm_firstdatacol = nparity; 471*10614SJonathan.Adams@Sun.COM rm->rm_datacopy = NULL; 472*10614SJonathan.Adams@Sun.COM rm->rm_reports = 0; 473*10614SJonathan.Adams@Sun.COM rm->rm_freed = 0; 474*10614SJonathan.Adams@Sun.COM rm->rm_ecksuminjected = 0; 475789Sahrens 47610105Sadam.leventhal@sun.com asize = 0; 47710105Sadam.leventhal@sun.com 47810105Sadam.leventhal@sun.com for (c = 0; c < scols; c++) { 479789Sahrens col = f + c; 480789Sahrens coff = o; 
481789Sahrens if (col >= dcols) { 482789Sahrens col -= dcols; 483789Sahrens coff += 1ULL << unit_shift; 484789Sahrens } 4852082Seschrock rm->rm_col[c].rc_devidx = col; 486789Sahrens rm->rm_col[c].rc_offset = coff; 487789Sahrens rm->rm_col[c].rc_data = NULL; 488*10614SJonathan.Adams@Sun.COM rm->rm_col[c].rc_gdata = NULL; 489789Sahrens rm->rm_col[c].rc_error = 0; 490789Sahrens rm->rm_col[c].rc_tried = 0; 491789Sahrens rm->rm_col[c].rc_skipped = 0; 49210105Sadam.leventhal@sun.com 49310105Sadam.leventhal@sun.com if (c >= acols) 49410105Sadam.leventhal@sun.com rm->rm_col[c].rc_size = 0; 49510105Sadam.leventhal@sun.com else if (c < bc) 49610105Sadam.leventhal@sun.com rm->rm_col[c].rc_size = (q + 1) << unit_shift; 49710105Sadam.leventhal@sun.com else 49810105Sadam.leventhal@sun.com rm->rm_col[c].rc_size = q << unit_shift; 49910105Sadam.leventhal@sun.com 50010105Sadam.leventhal@sun.com asize += rm->rm_col[c].rc_size; 501789Sahrens } 502789Sahrens 50310105Sadam.leventhal@sun.com ASSERT3U(asize, ==, tot << unit_shift); 50410105Sadam.leventhal@sun.com rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 50510450Sadam.leventhal@sun.com rm->rm_nskip = roundup(tot, nparity + 1) - tot; 50610450Sadam.leventhal@sun.com ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 50710450Sadam.leventhal@sun.com ASSERT3U(rm->rm_nskip, <=, nparity); 508789Sahrens 509789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++) 510789Sahrens rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 511789Sahrens 512789Sahrens rm->rm_col[c].rc_data = zio->io_data; 513789Sahrens 514789Sahrens for (c = c + 1; c < acols; c++) 515789Sahrens rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 516789Sahrens rm->rm_col[c - 1].rc_size; 517789Sahrens 5181133Seschrock /* 5192082Seschrock * If all data stored spans all columns, there's a danger that parity 5202082Seschrock * will always be on the same device and, since parity isn't read 5212082Seschrock * during normal operation, that 
that device's I/O bandwidth won't be 5222082Seschrock * used effectively. We therefore switch the parity every 1MB. 5232082Seschrock * 5242082Seschrock * ... at least that was, ostensibly, the theory. As a practical 5252082Seschrock * matter unless we juggle the parity between all devices evenly, we 5262082Seschrock * won't see any benefit. Further, occasional writes that aren't a 5272082Seschrock * multiple of the LCM of the number of children and the minimum 5282082Seschrock * stripe width are sufficient to avoid pessimal behavior. 5292082Seschrock * Unfortunately, this decision created an implicit on-disk format 5303456Sahl * requirement that we need to support for all eternity, but only 5313456Sahl * for single-parity RAID-Z. 53210450Sadam.leventhal@sun.com * 53310450Sadam.leventhal@sun.com * If we intend to skip a sector in the zeroth column for padding 53410450Sadam.leventhal@sun.com * we must make sure to note this swap. We will never intend to 53510450Sadam.leventhal@sun.com * skip the first column since at least one data and one parity 53610450Sadam.leventhal@sun.com * column must appear in each row. 
5371133Seschrock */ 5381133Seschrock ASSERT(rm->rm_cols >= 2); 5391133Seschrock ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 540789Sahrens 5412082Seschrock if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 5422082Seschrock devidx = rm->rm_col[0].rc_devidx; 5431133Seschrock o = rm->rm_col[0].rc_offset; 5442082Seschrock rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 5451133Seschrock rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 5462082Seschrock rm->rm_col[1].rc_devidx = devidx; 5471133Seschrock rm->rm_col[1].rc_offset = o; 54810450Sadam.leventhal@sun.com 54910450Sadam.leventhal@sun.com if (rm->rm_skipstart == 0) 55010450Sadam.leventhal@sun.com rm->rm_skipstart = 1; 551789Sahrens } 552789Sahrens 553789Sahrens zio->io_vsd = rm; 554*10614SJonathan.Adams@Sun.COM zio->io_vsd_ops = &vdev_raidz_vsd_ops; 555789Sahrens return (rm); 556789Sahrens } 557789Sahrens 558789Sahrens static void 5592082Seschrock vdev_raidz_generate_parity_p(raidz_map_t *rm) 5602082Seschrock { 5612082Seschrock uint64_t *p, *src, pcount, ccount, i; 5622082Seschrock int c; 5632082Seschrock 5642082Seschrock pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 5652082Seschrock 5662082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 5672082Seschrock src = rm->rm_col[c].rc_data; 5682082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 5692082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 5702082Seschrock 5712082Seschrock if (c == rm->rm_firstdatacol) { 5722082Seschrock ASSERT(ccount == pcount); 57310105Sadam.leventhal@sun.com for (i = 0; i < ccount; i++, src++, p++) { 5742082Seschrock *p = *src; 5752082Seschrock } 5762082Seschrock } else { 5772082Seschrock ASSERT(ccount <= pcount); 57810105Sadam.leventhal@sun.com for (i = 0; i < ccount; i++, src++, p++) { 5792082Seschrock *p ^= *src; 5802082Seschrock } 5812082Seschrock } 5822082Seschrock } 5832082Seschrock } 5842082Seschrock 5852082Seschrock static void 5862082Seschrock 
vdev_raidz_generate_parity_pq(raidz_map_t *rm) 587789Sahrens { 58810105Sadam.leventhal@sun.com uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 5892082Seschrock int c; 5902082Seschrock 59110105Sadam.leventhal@sun.com pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 5922082Seschrock ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 5932082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 5942082Seschrock 5952082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 5962082Seschrock src = rm->rm_col[c].rc_data; 5972082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 5982082Seschrock q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 59910105Sadam.leventhal@sun.com 60010105Sadam.leventhal@sun.com ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 6012082Seschrock 6022082Seschrock if (c == rm->rm_firstdatacol) { 60310105Sadam.leventhal@sun.com ASSERT(ccnt == pcnt || ccnt == 0); 60410105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++) { 6052082Seschrock *p = *src; 60610105Sadam.leventhal@sun.com *q = *src; 6072082Seschrock } 60810105Sadam.leventhal@sun.com for (; i < pcnt; i++, src++, p++, q++) { 60910105Sadam.leventhal@sun.com *p = 0; 6102082Seschrock *q = 0; 6112082Seschrock } 6122082Seschrock } else { 61310105Sadam.leventhal@sun.com ASSERT(ccnt <= pcnt); 614789Sahrens 6152082Seschrock /* 61610105Sadam.leventhal@sun.com * Apply the algorithm described above by multiplying 61710105Sadam.leventhal@sun.com * the previous result and adding in the new value. 6182082Seschrock */ 61910105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++) { 62010105Sadam.leventhal@sun.com *p ^= *src; 62110105Sadam.leventhal@sun.com 62210105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask); 6232082Seschrock *q ^= *src; 6242082Seschrock } 6252082Seschrock 6262082Seschrock /* 6272082Seschrock * Treat short columns as though they are full of 0s. 62810105Sadam.leventhal@sun.com * Note that there's therefore nothing needed for P. 
6292082Seschrock */ 63010105Sadam.leventhal@sun.com for (; i < pcnt; i++, q++) { 63110105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask); 6322082Seschrock } 6332082Seschrock } 6342082Seschrock } 6352082Seschrock } 6362082Seschrock 6372082Seschrock static void 63810105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 63910105Sadam.leventhal@sun.com { 64010105Sadam.leventhal@sun.com uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 64110105Sadam.leventhal@sun.com int c; 64210105Sadam.leventhal@sun.com 64310105Sadam.leventhal@sun.com pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 64410105Sadam.leventhal@sun.com ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 64510105Sadam.leventhal@sun.com rm->rm_col[VDEV_RAIDZ_Q].rc_size); 64610105Sadam.leventhal@sun.com ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 64710105Sadam.leventhal@sun.com rm->rm_col[VDEV_RAIDZ_R].rc_size); 64810105Sadam.leventhal@sun.com 64910105Sadam.leventhal@sun.com for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 65010105Sadam.leventhal@sun.com src = rm->rm_col[c].rc_data; 65110105Sadam.leventhal@sun.com p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 65210105Sadam.leventhal@sun.com q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 65310105Sadam.leventhal@sun.com r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 65410105Sadam.leventhal@sun.com 65510105Sadam.leventhal@sun.com ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 65610105Sadam.leventhal@sun.com 65710105Sadam.leventhal@sun.com if (c == rm->rm_firstdatacol) { 65810105Sadam.leventhal@sun.com ASSERT(ccnt == pcnt || ccnt == 0); 65910105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 66010105Sadam.leventhal@sun.com *p = *src; 66110105Sadam.leventhal@sun.com *q = *src; 66210105Sadam.leventhal@sun.com *r = *src; 66310105Sadam.leventhal@sun.com } 66410105Sadam.leventhal@sun.com for (; i < pcnt; i++, src++, p++, q++, r++) { 66510105Sadam.leventhal@sun.com *p = 0; 66610105Sadam.leventhal@sun.com *q = 0; 
66710105Sadam.leventhal@sun.com *r = 0; 66810105Sadam.leventhal@sun.com } 66910105Sadam.leventhal@sun.com } else { 67010105Sadam.leventhal@sun.com ASSERT(ccnt <= pcnt); 67110105Sadam.leventhal@sun.com 67210105Sadam.leventhal@sun.com /* 67310105Sadam.leventhal@sun.com * Apply the algorithm described above by multiplying 67410105Sadam.leventhal@sun.com * the previous result and adding in the new value. 67510105Sadam.leventhal@sun.com */ 67610105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 67710105Sadam.leventhal@sun.com *p ^= *src; 67810105Sadam.leventhal@sun.com 67910105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask); 68010105Sadam.leventhal@sun.com *q ^= *src; 68110105Sadam.leventhal@sun.com 68210105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_4(*r, mask); 68310105Sadam.leventhal@sun.com *r ^= *src; 68410105Sadam.leventhal@sun.com } 68510105Sadam.leventhal@sun.com 68610105Sadam.leventhal@sun.com /* 68710105Sadam.leventhal@sun.com * Treat short columns as though they are full of 0s. 68810105Sadam.leventhal@sun.com * Note that there's therefore nothing needed for P. 68910105Sadam.leventhal@sun.com */ 69010105Sadam.leventhal@sun.com for (; i < pcnt; i++, q++, r++) { 69110105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask); 69210105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_4(*r, mask); 69310105Sadam.leventhal@sun.com } 69410105Sadam.leventhal@sun.com } 69510105Sadam.leventhal@sun.com } 69610105Sadam.leventhal@sun.com } 69710105Sadam.leventhal@sun.com 69810105Sadam.leventhal@sun.com /* 69910105Sadam.leventhal@sun.com * Generate RAID parity in the first virtual columns according to the number of 70010105Sadam.leventhal@sun.com * parity columns available. 
70110105Sadam.leventhal@sun.com */ 70210105Sadam.leventhal@sun.com static void 70310105Sadam.leventhal@sun.com vdev_raidz_generate_parity(raidz_map_t *rm) 70410105Sadam.leventhal@sun.com { 70510105Sadam.leventhal@sun.com switch (rm->rm_firstdatacol) { 70610105Sadam.leventhal@sun.com case 1: 70710105Sadam.leventhal@sun.com vdev_raidz_generate_parity_p(rm); 70810105Sadam.leventhal@sun.com break; 70910105Sadam.leventhal@sun.com case 2: 71010105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pq(rm); 71110105Sadam.leventhal@sun.com break; 71210105Sadam.leventhal@sun.com case 3: 71310105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pqr(rm); 71410105Sadam.leventhal@sun.com break; 71510105Sadam.leventhal@sun.com default: 71610105Sadam.leventhal@sun.com cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 71710105Sadam.leventhal@sun.com } 71810105Sadam.leventhal@sun.com } 71910105Sadam.leventhal@sun.com 72010105Sadam.leventhal@sun.com static int 72110105Sadam.leventhal@sun.com vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 7222082Seschrock { 7232082Seschrock uint64_t *dst, *src, xcount, ccount, count, i; 72410105Sadam.leventhal@sun.com int x = tgts[0]; 7252082Seschrock int c; 7262082Seschrock 72710105Sadam.leventhal@sun.com ASSERT(ntgts == 1); 72810105Sadam.leventhal@sun.com ASSERT(x >= rm->rm_firstdatacol); 72910105Sadam.leventhal@sun.com ASSERT(x < rm->rm_cols); 73010105Sadam.leventhal@sun.com 7312082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 7322082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 7332082Seschrock ASSERT(xcount > 0); 7342082Seschrock 7352082Seschrock src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 7362082Seschrock dst = rm->rm_col[x].rc_data; 7372082Seschrock for (i = 0; i < xcount; i++, dst++, src++) { 7382082Seschrock *dst = *src; 7392082Seschrock } 7402082Seschrock 7412082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 742789Sahrens src = rm->rm_col[c].rc_data; 743789Sahrens 
dst = rm->rm_col[x].rc_data; 7442082Seschrock 7452082Seschrock if (c == x) 7462082Seschrock continue; 7472082Seschrock 7482082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 7492082Seschrock count = MIN(ccount, xcount); 7502082Seschrock 7512082Seschrock for (i = 0; i < count; i++, dst++, src++) { 7522082Seschrock *dst ^= *src; 753789Sahrens } 754789Sahrens } 75510105Sadam.leventhal@sun.com 75610105Sadam.leventhal@sun.com return (1 << VDEV_RAIDZ_P); 757789Sahrens } 758789Sahrens 75910105Sadam.leventhal@sun.com static int 76010105Sadam.leventhal@sun.com vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 7612082Seschrock { 7622082Seschrock uint64_t *dst, *src, xcount, ccount, count, mask, i; 7632082Seschrock uint8_t *b; 76410105Sadam.leventhal@sun.com int x = tgts[0]; 7652082Seschrock int c, j, exp; 7662082Seschrock 76710105Sadam.leventhal@sun.com ASSERT(ntgts == 1); 76810105Sadam.leventhal@sun.com 7692082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 7702082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 7712082Seschrock 7722082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 7732082Seschrock src = rm->rm_col[c].rc_data; 7742082Seschrock dst = rm->rm_col[x].rc_data; 7752082Seschrock 7762082Seschrock if (c == x) 7772082Seschrock ccount = 0; 7782082Seschrock else 7792082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 7802082Seschrock 7812082Seschrock count = MIN(ccount, xcount); 7822082Seschrock 7832082Seschrock if (c == rm->rm_firstdatacol) { 7842082Seschrock for (i = 0; i < count; i++, dst++, src++) { 7852082Seschrock *dst = *src; 7862082Seschrock } 7872082Seschrock for (; i < xcount; i++, dst++) { 7882082Seschrock *dst = 0; 7892082Seschrock } 7902082Seschrock 7912082Seschrock } else { 7922082Seschrock for (i = 0; i < count; i++, dst++, src++) { 79310105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*dst, mask); 7942082Seschrock *dst ^= *src; 7952082Seschrock } 
7962082Seschrock 7972082Seschrock for (; i < xcount; i++, dst++) { 79810105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*dst, mask); 7992082Seschrock } 8002082Seschrock } 8012082Seschrock } 8022082Seschrock 8032082Seschrock src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 8042082Seschrock dst = rm->rm_col[x].rc_data; 8052082Seschrock exp = 255 - (rm->rm_cols - 1 - x); 8062082Seschrock 8072082Seschrock for (i = 0; i < xcount; i++, dst++, src++) { 8082082Seschrock *dst ^= *src; 8092082Seschrock for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 8102082Seschrock *b = vdev_raidz_exp2(*b, exp); 8112082Seschrock } 8122082Seschrock } 81310105Sadam.leventhal@sun.com 81410105Sadam.leventhal@sun.com return (1 << VDEV_RAIDZ_Q); 8152082Seschrock } 8162082Seschrock 81710105Sadam.leventhal@sun.com static int 81810105Sadam.leventhal@sun.com vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 8192082Seschrock { 8202082Seschrock uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 8212082Seschrock void *pdata, *qdata; 8222082Seschrock uint64_t xsize, ysize, i; 82310105Sadam.leventhal@sun.com int x = tgts[0]; 82410105Sadam.leventhal@sun.com int y = tgts[1]; 8252082Seschrock 82610105Sadam.leventhal@sun.com ASSERT(ntgts == 2); 8272082Seschrock ASSERT(x < y); 8282082Seschrock ASSERT(x >= rm->rm_firstdatacol); 8292082Seschrock ASSERT(y < rm->rm_cols); 8302082Seschrock 8312082Seschrock ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 8322082Seschrock 8332082Seschrock /* 8342082Seschrock * Move the parity data aside -- we're going to compute parity as 8352082Seschrock * though columns x and y were full of zeros -- Pxy and Qxy. We want to 8362082Seschrock * reuse the parity generation mechanism without trashing the actual 8372082Seschrock * parity so we make those columns appear to be full of zeros by 8382082Seschrock * setting their lengths to zero. 
8392082Seschrock */ 8402082Seschrock pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 8412082Seschrock qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 8422082Seschrock xsize = rm->rm_col[x].rc_size; 8432082Seschrock ysize = rm->rm_col[y].rc_size; 8442082Seschrock 8452082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = 8462082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 8472082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = 8482082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 8492082Seschrock rm->rm_col[x].rc_size = 0; 8502082Seschrock rm->rm_col[y].rc_size = 0; 8512082Seschrock 8522082Seschrock vdev_raidz_generate_parity_pq(rm); 8532082Seschrock 8542082Seschrock rm->rm_col[x].rc_size = xsize; 8552082Seschrock rm->rm_col[y].rc_size = ysize; 8562082Seschrock 8572082Seschrock p = pdata; 8582082Seschrock q = qdata; 8592082Seschrock pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 8602082Seschrock qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 8612082Seschrock xd = rm->rm_col[x].rc_data; 8622082Seschrock yd = rm->rm_col[y].rc_data; 8632082Seschrock 8642082Seschrock /* 8652082Seschrock * We now have: 8662082Seschrock * Pxy = P + D_x + D_y 8672082Seschrock * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 8682082Seschrock * 8692082Seschrock * We can then solve for D_x: 8702082Seschrock * D_x = A * (P + Pxy) + B * (Q + Qxy) 8712082Seschrock * where 8722082Seschrock * A = 2^(x - y) * (2^(x - y) + 1)^-1 8732082Seschrock * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 8742082Seschrock * 8752082Seschrock * With D_x in hand, we can easily solve for D_y: 8762082Seschrock * D_y = P + Pxy + D_x 8772082Seschrock */ 8782082Seschrock 8792082Seschrock a = vdev_raidz_pow2[255 + x - y]; 8802082Seschrock b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 8812082Seschrock tmp = 255 - vdev_raidz_log2[a ^ 1]; 8822082Seschrock 8832082Seschrock aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 8842082Seschrock bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 8852082Seschrock 8862082Seschrock 
for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 8872082Seschrock *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 8882082Seschrock vdev_raidz_exp2(*q ^ *qxy, bexp); 8892082Seschrock 8902082Seschrock if (i < ysize) 8912082Seschrock *yd = *p ^ *pxy ^ *xd; 8922082Seschrock } 8932082Seschrock 8942082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 8952082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_size); 8962082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 8972082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 8982082Seschrock 8992082Seschrock /* 9002082Seschrock * Restore the saved parity data. 9012082Seschrock */ 9022082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 9032082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 90410105Sadam.leventhal@sun.com 90510105Sadam.leventhal@sun.com return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 90610105Sadam.leventhal@sun.com } 90710105Sadam.leventhal@sun.com 90810105Sadam.leventhal@sun.com /* BEGIN CSTYLED */ 90910105Sadam.leventhal@sun.com /* 91010105Sadam.leventhal@sun.com * In the general case of reconstruction, we must solve the system of linear 91110105Sadam.leventhal@sun.com * equations defined by the coeffecients used to generate parity as well as 91210105Sadam.leventhal@sun.com * the contents of the data and parity disks. 
 * This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 * computation as well as linear separability.
 *
 *      __                  __               __     __
 *      |   1   ..  1  1  1  |               |  p_0  |
 *      | 2^n-1 ..  4  2  1  |   __     __   |   :   |
 *      | 4^n-1 .. 16  4  1  |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0  0  0  |   |  D_1  |   |  d_0  |
 *      |   0   ..  0  0  0  | x |  D_2  | = |  d_1  |
 *      |   :       :  :  :  |   |   :   |   |  d_2  |
 *      |   0   ..  1  0  0  |   | D_n-1 |   |   :   |
 *      |   0   ..  0  1  0  |   ~~     ~~   |   :   |
 *      |   0   ..  0  0  1  |               | d_n-1 |
 *      ~~                  ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |   1   1   1   1   1   1   1   1 |
 *           | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16   4   1 |      / /
 *           |   1   0   0   0   0   0   0   0 |     / /
 *           |   0   1   0   0   0   0   0   0 | <--' /
 *  (V|I)  = |   0   0   1   0   0   0   0   0 | <---'
 *           |   0   0   0   1   0   0   0   0 |
 *           |   0   0   0   0   1   0   0   0 |
 *           |   0   0   0   0   0   1   0   0 |
 *           |   0   0   0   0   0   0   1   0 |
 *           |   0   0   0   0   0   0   0   1 |
 *           ~~                               ~~
 *           __                               __
 *           |   1   1   1   1   1   1   1   1 |
 *           | 128  64  32  16   8   4   2   1 |
 *           |  19 205 116  29  64  16   4   1 |
 *           |   1   0   0   0   0   0   0   0 |
 *           |   0   1   0   0   0   0   0   0 |
 *  (V|I)' = |   0   0   1   0   0   0   0   0 |
 *           |   0   0   0   1   0   0   0   0 |
 *           |   0   0   0   0   1   0   0   0 |
 *           |   0   0   0   0   0   1   0   0 |
 *           |   0   0   0   0   0   0   1   0 |
 *           |   0   0   0   0   0   0   0   1 |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 * |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 * |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208 |
 * |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |   0   0   1   0   0   0   0   0 |
 *                   | 167 100   5  41 159 169 217 208 |
 *                   | 166 100   4  40 158 168 216 209 |
 *   (V|I)'^-1     = |   0   0   0   1   0   0   0   0 |
 *                   |   0   0   0   0   1   0   0   0 |
 *                   |   0   0   0   0   0   1   0   0 |
 *                   |   0   0   0   0   0   0   1   0 |
 *                   |   0   0   0   0   0   0   0   1 |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
105910105Sadam.leventhal@sun.com */ 106010105Sadam.leventhal@sun.com /* END CSTYLED */ 106110105Sadam.leventhal@sun.com 106210105Sadam.leventhal@sun.com static void 106310105Sadam.leventhal@sun.com vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 106410105Sadam.leventhal@sun.com uint8_t **rows) 106510105Sadam.leventhal@sun.com { 106610105Sadam.leventhal@sun.com int i, j; 106710105Sadam.leventhal@sun.com int pow; 106810105Sadam.leventhal@sun.com 106910105Sadam.leventhal@sun.com ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 107010105Sadam.leventhal@sun.com 107110105Sadam.leventhal@sun.com /* 107210105Sadam.leventhal@sun.com * Fill in the missing rows of interest. 107310105Sadam.leventhal@sun.com */ 107410105Sadam.leventhal@sun.com for (i = 0; i < nmap; i++) { 107510105Sadam.leventhal@sun.com ASSERT3S(0, <=, map[i]); 107610105Sadam.leventhal@sun.com ASSERT3S(map[i], <=, 2); 107710105Sadam.leventhal@sun.com 107810105Sadam.leventhal@sun.com pow = map[i] * n; 107910105Sadam.leventhal@sun.com if (pow > 255) 108010105Sadam.leventhal@sun.com pow -= 255; 108110105Sadam.leventhal@sun.com ASSERT(pow <= 255); 108210105Sadam.leventhal@sun.com 108310105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 108410105Sadam.leventhal@sun.com pow -= map[i]; 108510105Sadam.leventhal@sun.com if (pow < 0) 108610105Sadam.leventhal@sun.com pow += 255; 108710105Sadam.leventhal@sun.com rows[i][j] = vdev_raidz_pow2[pow]; 108810105Sadam.leventhal@sun.com } 108910105Sadam.leventhal@sun.com } 10902082Seschrock } 10912082Seschrock 109210105Sadam.leventhal@sun.com static void 109310105Sadam.leventhal@sun.com vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 109410105Sadam.leventhal@sun.com uint8_t **rows, uint8_t **invrows, const uint8_t *used) 109510105Sadam.leventhal@sun.com { 109610105Sadam.leventhal@sun.com int i, j, ii, jj; 109710105Sadam.leventhal@sun.com uint8_t log; 109810105Sadam.leventhal@sun.com 109910105Sadam.leventhal@sun.com /* 
110010105Sadam.leventhal@sun.com * Assert that the first nmissing entries from the array of used 110110105Sadam.leventhal@sun.com * columns correspond to parity columns and that subsequent entries 110210105Sadam.leventhal@sun.com * correspond to data columns. 110310105Sadam.leventhal@sun.com */ 110410105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 110510105Sadam.leventhal@sun.com ASSERT3S(used[i], <, rm->rm_firstdatacol); 110610105Sadam.leventhal@sun.com } 110710105Sadam.leventhal@sun.com for (; i < n; i++) { 110810105Sadam.leventhal@sun.com ASSERT3S(used[i], >=, rm->rm_firstdatacol); 110910105Sadam.leventhal@sun.com } 111010105Sadam.leventhal@sun.com 111110105Sadam.leventhal@sun.com /* 111210105Sadam.leventhal@sun.com * First initialize the storage where we'll compute the inverse rows. 111310105Sadam.leventhal@sun.com */ 111410105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 111510105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 111610105Sadam.leventhal@sun.com invrows[i][j] = (i == j) ? 1 : 0; 111710105Sadam.leventhal@sun.com } 111810105Sadam.leventhal@sun.com } 111910105Sadam.leventhal@sun.com 112010105Sadam.leventhal@sun.com /* 112110105Sadam.leventhal@sun.com * Subtract all trivial rows from the rows of consequence. 
112210105Sadam.leventhal@sun.com */ 112310105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 112410105Sadam.leventhal@sun.com for (j = nmissing; j < n; j++) { 112510105Sadam.leventhal@sun.com ASSERT3U(used[j], >=, rm->rm_firstdatacol); 112610105Sadam.leventhal@sun.com jj = used[j] - rm->rm_firstdatacol; 112710105Sadam.leventhal@sun.com ASSERT3S(jj, <, n); 112810105Sadam.leventhal@sun.com invrows[i][j] = rows[i][jj]; 112910105Sadam.leventhal@sun.com rows[i][jj] = 0; 113010105Sadam.leventhal@sun.com } 113110105Sadam.leventhal@sun.com } 113210105Sadam.leventhal@sun.com 113310105Sadam.leventhal@sun.com /* 113410105Sadam.leventhal@sun.com * For each of the rows of interest, we must normalize it and subtract 113510105Sadam.leventhal@sun.com * a multiple of it from the other rows. 113610105Sadam.leventhal@sun.com */ 113710105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 113810105Sadam.leventhal@sun.com for (j = 0; j < missing[i]; j++) { 113910105Sadam.leventhal@sun.com ASSERT3U(rows[i][j], ==, 0); 114010105Sadam.leventhal@sun.com } 114110105Sadam.leventhal@sun.com ASSERT3U(rows[i][missing[i]], !=, 0); 114210105Sadam.leventhal@sun.com 114310105Sadam.leventhal@sun.com /* 114410105Sadam.leventhal@sun.com * Compute the inverse of the first element and multiply each 114510105Sadam.leventhal@sun.com * element in the row by that value. 
114610105Sadam.leventhal@sun.com */ 114710105Sadam.leventhal@sun.com log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 114810105Sadam.leventhal@sun.com 114910105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 115010105Sadam.leventhal@sun.com rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 115110105Sadam.leventhal@sun.com invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 115210105Sadam.leventhal@sun.com } 115310105Sadam.leventhal@sun.com 115410105Sadam.leventhal@sun.com for (ii = 0; ii < nmissing; ii++) { 115510105Sadam.leventhal@sun.com if (i == ii) 115610105Sadam.leventhal@sun.com continue; 115710105Sadam.leventhal@sun.com 115810105Sadam.leventhal@sun.com ASSERT3U(rows[ii][missing[i]], !=, 0); 115910105Sadam.leventhal@sun.com 116010105Sadam.leventhal@sun.com log = vdev_raidz_log2[rows[ii][missing[i]]]; 116110105Sadam.leventhal@sun.com 116210105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 116310105Sadam.leventhal@sun.com rows[ii][j] ^= 116410105Sadam.leventhal@sun.com vdev_raidz_exp2(rows[i][j], log); 116510105Sadam.leventhal@sun.com invrows[ii][j] ^= 116610105Sadam.leventhal@sun.com vdev_raidz_exp2(invrows[i][j], log); 116710105Sadam.leventhal@sun.com } 116810105Sadam.leventhal@sun.com } 116910105Sadam.leventhal@sun.com } 117010105Sadam.leventhal@sun.com 117110105Sadam.leventhal@sun.com /* 117210105Sadam.leventhal@sun.com * Verify that the data that is left in the rows are properly part of 117310105Sadam.leventhal@sun.com * an identity matrix. 
117410105Sadam.leventhal@sun.com */ 117510105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 117610105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 117710105Sadam.leventhal@sun.com if (j == missing[i]) { 117810105Sadam.leventhal@sun.com ASSERT3U(rows[i][j], ==, 1); 117910105Sadam.leventhal@sun.com } else { 118010105Sadam.leventhal@sun.com ASSERT3U(rows[i][j], ==, 0); 118110105Sadam.leventhal@sun.com } 118210105Sadam.leventhal@sun.com } 118310105Sadam.leventhal@sun.com } 118410105Sadam.leventhal@sun.com } 118510105Sadam.leventhal@sun.com 118610105Sadam.leventhal@sun.com static void 118710105Sadam.leventhal@sun.com vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 118810105Sadam.leventhal@sun.com int *missing, uint8_t **invrows, const uint8_t *used) 118910105Sadam.leventhal@sun.com { 119010105Sadam.leventhal@sun.com int i, j, x, cc, c; 119110105Sadam.leventhal@sun.com uint8_t *src; 119210105Sadam.leventhal@sun.com uint64_t ccount; 119310105Sadam.leventhal@sun.com uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 119410105Sadam.leventhal@sun.com uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 119510105Sadam.leventhal@sun.com uint8_t log, val; 119610105Sadam.leventhal@sun.com int ll; 119710105Sadam.leventhal@sun.com uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 119810105Sadam.leventhal@sun.com uint8_t *p, *pp; 119910105Sadam.leventhal@sun.com size_t psize; 120010105Sadam.leventhal@sun.com 120110105Sadam.leventhal@sun.com psize = sizeof (invlog[0][0]) * n * nmissing; 120210105Sadam.leventhal@sun.com p = kmem_alloc(psize, KM_SLEEP); 120310105Sadam.leventhal@sun.com 120410105Sadam.leventhal@sun.com for (pp = p, i = 0; i < nmissing; i++) { 120510105Sadam.leventhal@sun.com invlog[i] = pp; 120610105Sadam.leventhal@sun.com pp += n; 120710105Sadam.leventhal@sun.com } 120810105Sadam.leventhal@sun.com 120910105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 121010105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 121110105Sadam.leventhal@sun.com 
ASSERT3U(invrows[i][j], !=, 0); 121210105Sadam.leventhal@sun.com invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 121310105Sadam.leventhal@sun.com } 121410105Sadam.leventhal@sun.com } 121510105Sadam.leventhal@sun.com 121610105Sadam.leventhal@sun.com for (i = 0; i < n; i++) { 121710105Sadam.leventhal@sun.com c = used[i]; 121810105Sadam.leventhal@sun.com ASSERT3U(c, <, rm->rm_cols); 121910105Sadam.leventhal@sun.com 122010105Sadam.leventhal@sun.com src = rm->rm_col[c].rc_data; 122110105Sadam.leventhal@sun.com ccount = rm->rm_col[c].rc_size; 122210105Sadam.leventhal@sun.com for (j = 0; j < nmissing; j++) { 122310105Sadam.leventhal@sun.com cc = missing[j] + rm->rm_firstdatacol; 122410105Sadam.leventhal@sun.com ASSERT3U(cc, >=, rm->rm_firstdatacol); 122510105Sadam.leventhal@sun.com ASSERT3U(cc, <, rm->rm_cols); 122610105Sadam.leventhal@sun.com ASSERT3U(cc, !=, c); 122710105Sadam.leventhal@sun.com 122810105Sadam.leventhal@sun.com dst[j] = rm->rm_col[cc].rc_data; 122910105Sadam.leventhal@sun.com dcount[j] = rm->rm_col[cc].rc_size; 123010105Sadam.leventhal@sun.com } 123110105Sadam.leventhal@sun.com 123210105Sadam.leventhal@sun.com ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 123310105Sadam.leventhal@sun.com 123410105Sadam.leventhal@sun.com for (x = 0; x < ccount; x++, src++) { 123510105Sadam.leventhal@sun.com if (*src != 0) 123610105Sadam.leventhal@sun.com log = vdev_raidz_log2[*src]; 123710105Sadam.leventhal@sun.com 123810105Sadam.leventhal@sun.com for (cc = 0; cc < nmissing; cc++) { 123910105Sadam.leventhal@sun.com if (x >= dcount[cc]) 124010105Sadam.leventhal@sun.com continue; 124110105Sadam.leventhal@sun.com 124210105Sadam.leventhal@sun.com if (*src == 0) { 124310105Sadam.leventhal@sun.com val = 0; 124410105Sadam.leventhal@sun.com } else { 124510105Sadam.leventhal@sun.com if ((ll = log + invlog[cc][i]) >= 255) 124610105Sadam.leventhal@sun.com ll -= 255; 124710105Sadam.leventhal@sun.com val = vdev_raidz_pow2[ll]; 124810105Sadam.leventhal@sun.com } 
124910105Sadam.leventhal@sun.com 125010105Sadam.leventhal@sun.com if (i == 0) 125110105Sadam.leventhal@sun.com dst[cc][x] = val; 125210105Sadam.leventhal@sun.com else 125310105Sadam.leventhal@sun.com dst[cc][x] ^= val; 125410105Sadam.leventhal@sun.com } 125510105Sadam.leventhal@sun.com } 125610105Sadam.leventhal@sun.com } 125710105Sadam.leventhal@sun.com 125810105Sadam.leventhal@sun.com kmem_free(p, psize); 125910105Sadam.leventhal@sun.com } 126010105Sadam.leventhal@sun.com 126110105Sadam.leventhal@sun.com static int 126210105Sadam.leventhal@sun.com vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 126310105Sadam.leventhal@sun.com { 126410105Sadam.leventhal@sun.com int n, i, c, t, tt; 126510105Sadam.leventhal@sun.com int nmissing_rows; 126610105Sadam.leventhal@sun.com int missing_rows[VDEV_RAIDZ_MAXPARITY]; 126710105Sadam.leventhal@sun.com int parity_map[VDEV_RAIDZ_MAXPARITY]; 126810105Sadam.leventhal@sun.com 126910105Sadam.leventhal@sun.com uint8_t *p, *pp; 127010105Sadam.leventhal@sun.com size_t psize; 127110105Sadam.leventhal@sun.com 127210105Sadam.leventhal@sun.com uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 127310105Sadam.leventhal@sun.com uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 127410105Sadam.leventhal@sun.com uint8_t *used; 127510105Sadam.leventhal@sun.com 127610105Sadam.leventhal@sun.com int code = 0; 127710105Sadam.leventhal@sun.com 127810105Sadam.leventhal@sun.com 127910105Sadam.leventhal@sun.com n = rm->rm_cols - rm->rm_firstdatacol; 128010105Sadam.leventhal@sun.com 128110105Sadam.leventhal@sun.com /* 128210105Sadam.leventhal@sun.com * Figure out which data columns are missing. 
128310105Sadam.leventhal@sun.com */ 128410105Sadam.leventhal@sun.com nmissing_rows = 0; 128510105Sadam.leventhal@sun.com for (t = 0; t < ntgts; t++) { 128610105Sadam.leventhal@sun.com if (tgts[t] >= rm->rm_firstdatacol) { 128710105Sadam.leventhal@sun.com missing_rows[nmissing_rows++] = 128810105Sadam.leventhal@sun.com tgts[t] - rm->rm_firstdatacol; 128910105Sadam.leventhal@sun.com } 129010105Sadam.leventhal@sun.com } 129110105Sadam.leventhal@sun.com 129210105Sadam.leventhal@sun.com /* 129310105Sadam.leventhal@sun.com * Figure out which parity columns to use to help generate the missing 129410105Sadam.leventhal@sun.com * data columns. 129510105Sadam.leventhal@sun.com */ 129610105Sadam.leventhal@sun.com for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 129710105Sadam.leventhal@sun.com ASSERT(tt < ntgts); 129810105Sadam.leventhal@sun.com ASSERT(c < rm->rm_firstdatacol); 129910105Sadam.leventhal@sun.com 130010105Sadam.leventhal@sun.com /* 130110105Sadam.leventhal@sun.com * Skip any targeted parity columns. 
130210105Sadam.leventhal@sun.com */ 130310105Sadam.leventhal@sun.com if (c == tgts[tt]) { 130410105Sadam.leventhal@sun.com tt++; 130510105Sadam.leventhal@sun.com continue; 130610105Sadam.leventhal@sun.com } 130710105Sadam.leventhal@sun.com 130810105Sadam.leventhal@sun.com code |= 1 << c; 130910105Sadam.leventhal@sun.com 131010105Sadam.leventhal@sun.com parity_map[i] = c; 131110105Sadam.leventhal@sun.com i++; 131210105Sadam.leventhal@sun.com } 131310105Sadam.leventhal@sun.com 131410105Sadam.leventhal@sun.com ASSERT(code != 0); 131510105Sadam.leventhal@sun.com ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 131610105Sadam.leventhal@sun.com 131710105Sadam.leventhal@sun.com psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 131810105Sadam.leventhal@sun.com nmissing_rows * n + sizeof (used[0]) * n; 131910105Sadam.leventhal@sun.com p = kmem_alloc(psize, KM_SLEEP); 132010105Sadam.leventhal@sun.com 132110105Sadam.leventhal@sun.com for (pp = p, i = 0; i < nmissing_rows; i++) { 132210105Sadam.leventhal@sun.com rows[i] = pp; 132310105Sadam.leventhal@sun.com pp += n; 132410105Sadam.leventhal@sun.com invrows[i] = pp; 132510105Sadam.leventhal@sun.com pp += n; 132610105Sadam.leventhal@sun.com } 132710105Sadam.leventhal@sun.com used = pp; 132810105Sadam.leventhal@sun.com 132910105Sadam.leventhal@sun.com for (i = 0; i < nmissing_rows; i++) { 133010105Sadam.leventhal@sun.com used[i] = parity_map[i]; 133110105Sadam.leventhal@sun.com } 133210105Sadam.leventhal@sun.com 133310105Sadam.leventhal@sun.com for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 133410105Sadam.leventhal@sun.com if (tt < nmissing_rows && 133510105Sadam.leventhal@sun.com c == missing_rows[tt] + rm->rm_firstdatacol) { 133610105Sadam.leventhal@sun.com tt++; 133710105Sadam.leventhal@sun.com continue; 133810105Sadam.leventhal@sun.com } 133910105Sadam.leventhal@sun.com 134010105Sadam.leventhal@sun.com ASSERT3S(i, <, n); 134110105Sadam.leventhal@sun.com used[i] = c; 134210105Sadam.leventhal@sun.com i++; 
134310105Sadam.leventhal@sun.com } 134410105Sadam.leventhal@sun.com 134510105Sadam.leventhal@sun.com /* 134610105Sadam.leventhal@sun.com * Initialize the interesting rows of the matrix. 134710105Sadam.leventhal@sun.com */ 134810105Sadam.leventhal@sun.com vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 134910105Sadam.leventhal@sun.com 135010105Sadam.leventhal@sun.com /* 135110105Sadam.leventhal@sun.com * Invert the matrix. 135210105Sadam.leventhal@sun.com */ 135310105Sadam.leventhal@sun.com vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 135410105Sadam.leventhal@sun.com invrows, used); 135510105Sadam.leventhal@sun.com 135610105Sadam.leventhal@sun.com /* 135710105Sadam.leventhal@sun.com * Reconstruct the missing data using the generated matrix. 135810105Sadam.leventhal@sun.com */ 135910105Sadam.leventhal@sun.com vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 136010105Sadam.leventhal@sun.com invrows, used); 136110105Sadam.leventhal@sun.com 136210105Sadam.leventhal@sun.com kmem_free(p, psize); 136310105Sadam.leventhal@sun.com 136410105Sadam.leventhal@sun.com return (code); 136510105Sadam.leventhal@sun.com } 136610105Sadam.leventhal@sun.com 136710105Sadam.leventhal@sun.com static int 136810105Sadam.leventhal@sun.com vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 136910105Sadam.leventhal@sun.com { 137010105Sadam.leventhal@sun.com int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 137110105Sadam.leventhal@sun.com int ntgts; 137210105Sadam.leventhal@sun.com int i, c; 137310105Sadam.leventhal@sun.com int code; 137410105Sadam.leventhal@sun.com int nbadparity, nbaddata; 137510105Sadam.leventhal@sun.com int parity_valid[VDEV_RAIDZ_MAXPARITY]; 137610105Sadam.leventhal@sun.com 137710105Sadam.leventhal@sun.com /* 137810105Sadam.leventhal@sun.com * The tgts list must already be sorted. 
137910105Sadam.leventhal@sun.com */ 138010105Sadam.leventhal@sun.com for (i = 1; i < nt; i++) { 138110105Sadam.leventhal@sun.com ASSERT(t[i] > t[i - 1]); 138210105Sadam.leventhal@sun.com } 138310105Sadam.leventhal@sun.com 138410105Sadam.leventhal@sun.com nbadparity = rm->rm_firstdatacol; 138510105Sadam.leventhal@sun.com nbaddata = rm->rm_cols - nbadparity; 138610105Sadam.leventhal@sun.com ntgts = 0; 138710105Sadam.leventhal@sun.com for (i = 0, c = 0; c < rm->rm_cols; c++) { 138810105Sadam.leventhal@sun.com if (c < rm->rm_firstdatacol) 138910105Sadam.leventhal@sun.com parity_valid[c] = B_FALSE; 139010105Sadam.leventhal@sun.com 139110105Sadam.leventhal@sun.com if (i < nt && c == t[i]) { 139210105Sadam.leventhal@sun.com tgts[ntgts++] = c; 139310105Sadam.leventhal@sun.com i++; 139410105Sadam.leventhal@sun.com } else if (rm->rm_col[c].rc_error != 0) { 139510105Sadam.leventhal@sun.com tgts[ntgts++] = c; 139610105Sadam.leventhal@sun.com } else if (c >= rm->rm_firstdatacol) { 139710105Sadam.leventhal@sun.com nbaddata--; 139810105Sadam.leventhal@sun.com } else { 139910105Sadam.leventhal@sun.com parity_valid[c] = B_TRUE; 140010105Sadam.leventhal@sun.com nbadparity--; 140110105Sadam.leventhal@sun.com } 140210105Sadam.leventhal@sun.com } 140310105Sadam.leventhal@sun.com 140410105Sadam.leventhal@sun.com ASSERT(ntgts >= nt); 140510105Sadam.leventhal@sun.com ASSERT(nbaddata >= 0); 140610105Sadam.leventhal@sun.com ASSERT(nbaddata + nbadparity == ntgts); 140710105Sadam.leventhal@sun.com 140810105Sadam.leventhal@sun.com dt = &tgts[nbadparity]; 140910105Sadam.leventhal@sun.com 141010105Sadam.leventhal@sun.com /* 141110105Sadam.leventhal@sun.com * See if we can use any of our optimized reconstruction routines. 
141210105Sadam.leventhal@sun.com */ 141310105Sadam.leventhal@sun.com if (!vdev_raidz_default_to_general) { 141410105Sadam.leventhal@sun.com switch (nbaddata) { 141510105Sadam.leventhal@sun.com case 1: 141610105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_P]) 141710105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_p(rm, dt, 1)); 141810105Sadam.leventhal@sun.com 141910105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 1); 142010105Sadam.leventhal@sun.com 142110105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_Q]) 142210105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_q(rm, dt, 1)); 142310105Sadam.leventhal@sun.com 142410105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 2); 142510105Sadam.leventhal@sun.com break; 142610105Sadam.leventhal@sun.com 142710105Sadam.leventhal@sun.com case 2: 142810105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 1); 142910105Sadam.leventhal@sun.com 143010105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_P] && 143110105Sadam.leventhal@sun.com parity_valid[VDEV_RAIDZ_Q]) 143210105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 143310105Sadam.leventhal@sun.com 143410105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 2); 143510105Sadam.leventhal@sun.com 143610105Sadam.leventhal@sun.com break; 143710105Sadam.leventhal@sun.com } 143810105Sadam.leventhal@sun.com } 143910105Sadam.leventhal@sun.com 144010105Sadam.leventhal@sun.com code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 144110105Sadam.leventhal@sun.com ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 144210105Sadam.leventhal@sun.com ASSERT(code > 0); 144310105Sadam.leventhal@sun.com return (code); 144410105Sadam.leventhal@sun.com } 14452082Seschrock 1446789Sahrens static int 1447789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 1448789Sahrens { 144910105Sadam.leventhal@sun.com vdev_t *cvd; 14502082Seschrock uint64_t nparity = vd->vdev_nparity; 145110105Sadam.leventhal@sun.com int c; 
1452789Sahrens int lasterror = 0; 1453789Sahrens int numerrors = 0; 1454789Sahrens 14552082Seschrock ASSERT(nparity > 0); 14562082Seschrock 14572082Seschrock if (nparity > VDEV_RAIDZ_MAXPARITY || 14582082Seschrock vd->vdev_children < nparity + 1) { 1459789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1460789Sahrens return (EINVAL); 1461789Sahrens } 1462789Sahrens 14639846SEric.Taylor@Sun.COM vdev_open_children(vd); 1464789Sahrens 146510105Sadam.leventhal@sun.com for (c = 0; c < vd->vdev_children; c++) { 146610105Sadam.leventhal@sun.com cvd = vd->vdev_child[c]; 14679846SEric.Taylor@Sun.COM 146810105Sadam.leventhal@sun.com if (cvd->vdev_open_error != 0) { 14699846SEric.Taylor@Sun.COM lasterror = cvd->vdev_open_error; 1470789Sahrens numerrors++; 1471789Sahrens continue; 1472789Sahrens } 1473789Sahrens 1474789Sahrens *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 14751732Sbonwick *ashift = MAX(*ashift, cvd->vdev_ashift); 1476789Sahrens } 1477789Sahrens 1478789Sahrens *asize *= vd->vdev_children; 1479789Sahrens 14802082Seschrock if (numerrors > nparity) { 1481789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1482789Sahrens return (lasterror); 1483789Sahrens } 1484789Sahrens 1485789Sahrens return (0); 1486789Sahrens } 1487789Sahrens 1488789Sahrens static void 1489789Sahrens vdev_raidz_close(vdev_t *vd) 1490789Sahrens { 149110105Sadam.leventhal@sun.com int c; 149210105Sadam.leventhal@sun.com 149310105Sadam.leventhal@sun.com for (c = 0; c < vd->vdev_children; c++) 1494789Sahrens vdev_close(vd->vdev_child[c]); 1495789Sahrens } 1496789Sahrens 1497789Sahrens static uint64_t 1498789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1499789Sahrens { 1500789Sahrens uint64_t asize; 15011732Sbonwick uint64_t ashift = vd->vdev_top->vdev_ashift; 1502789Sahrens uint64_t cols = vd->vdev_children; 15032082Seschrock uint64_t nparity = vd->vdev_nparity; 1504789Sahrens 15051732Sbonwick asize = ((psize - 1) >> ashift) + 1; 15062082Seschrock asize += nparity * ((asize + 
cols - nparity - 1) / (cols - nparity)); 15072082Seschrock asize = roundup(asize, nparity + 1) << ashift; 1508789Sahrens 1509789Sahrens return (asize); 1510789Sahrens } 1511789Sahrens 1512789Sahrens static void 1513789Sahrens vdev_raidz_child_done(zio_t *zio) 1514789Sahrens { 1515789Sahrens raidz_col_t *rc = zio->io_private; 1516789Sahrens 1517789Sahrens rc->rc_error = zio->io_error; 1518789Sahrens rc->rc_tried = 1; 1519789Sahrens rc->rc_skipped = 0; 1520789Sahrens } 1521789Sahrens 15225530Sbonwick static int 1523789Sahrens vdev_raidz_io_start(zio_t *zio) 1524789Sahrens { 1525789Sahrens vdev_t *vd = zio->io_vd; 15261732Sbonwick vdev_t *tvd = vd->vdev_top; 1527789Sahrens vdev_t *cvd; 1528789Sahrens blkptr_t *bp = zio->io_bp; 1529789Sahrens raidz_map_t *rm; 1530789Sahrens raidz_col_t *rc; 153110105Sadam.leventhal@sun.com int c, i; 1532789Sahrens 15332082Seschrock rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 15342082Seschrock vd->vdev_nparity); 1535789Sahrens 15361775Sbillm ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1537789Sahrens 1538789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 153910105Sadam.leventhal@sun.com vdev_raidz_generate_parity(rm); 1540789Sahrens 1541789Sahrens for (c = 0; c < rm->rm_cols; c++) { 1542789Sahrens rc = &rm->rm_col[c]; 15432082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 1544789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1545789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 15467754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0, 1547789Sahrens vdev_raidz_child_done, rc)); 1548789Sahrens } 15495530Sbonwick 155010105Sadam.leventhal@sun.com /* 155110105Sadam.leventhal@sun.com * Generate optional I/Os for any skipped sectors to improve 155210105Sadam.leventhal@sun.com * aggregation contiguity. 
155310105Sadam.leventhal@sun.com */ 155410450Sadam.leventhal@sun.com for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 155510105Sadam.leventhal@sun.com ASSERT(c <= rm->rm_scols); 155610105Sadam.leventhal@sun.com if (c == rm->rm_scols) 155710105Sadam.leventhal@sun.com c = 0; 155810105Sadam.leventhal@sun.com rc = &rm->rm_col[c]; 155910105Sadam.leventhal@sun.com cvd = vd->vdev_child[rc->rc_devidx]; 156010105Sadam.leventhal@sun.com zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 156110105Sadam.leventhal@sun.com rc->rc_offset + rc->rc_size, NULL, 156210105Sadam.leventhal@sun.com 1 << tvd->vdev_ashift, 156310105Sadam.leventhal@sun.com zio->io_type, zio->io_priority, 156410105Sadam.leventhal@sun.com ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 156510105Sadam.leventhal@sun.com } 156610105Sadam.leventhal@sun.com 15677754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 1568789Sahrens } 1569789Sahrens 1570789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 1571789Sahrens 15722082Seschrock /* 15732082Seschrock * Iterate over the columns in reverse order so that we hit the parity 157410105Sadam.leventhal@sun.com * last -- any errors along the way will force us to read the parity. 
15752082Seschrock */ 1576789Sahrens for (c = rm->rm_cols - 1; c >= 0; c--) { 1577789Sahrens rc = &rm->rm_col[c]; 15782082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 15795329Sgw25295 if (!vdev_readable(cvd)) { 15802082Seschrock if (c >= rm->rm_firstdatacol) 15812082Seschrock rm->rm_missingdata++; 15822082Seschrock else 15832082Seschrock rm->rm_missingparity++; 1584789Sahrens rc->rc_error = ENXIO; 1585789Sahrens rc->rc_tried = 1; /* don't even try */ 1586789Sahrens rc->rc_skipped = 1; 1587789Sahrens continue; 1588789Sahrens } 15898241SJeff.Bonwick@Sun.COM if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) { 15902082Seschrock if (c >= rm->rm_firstdatacol) 15912082Seschrock rm->rm_missingdata++; 15922082Seschrock else 15932082Seschrock rm->rm_missingparity++; 1594789Sahrens rc->rc_error = ESTALE; 1595789Sahrens rc->rc_skipped = 1; 1596789Sahrens continue; 1597789Sahrens } 15982082Seschrock if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 15999434SMark.Musante@Sun.COM (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1600789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1601789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 16027754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0, 1603789Sahrens vdev_raidz_child_done, rc)); 1604789Sahrens } 1605789Sahrens } 1606789Sahrens 16077754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 1608789Sahrens } 1609789Sahrens 16101544Seschrock /* 16111544Seschrock * Report a checksum error for a child of a RAID-Z device. 
16121544Seschrock */ 16131544Seschrock static void 1614*10614SJonathan.Adams@Sun.COM raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) 16151544Seschrock { 16162082Seschrock vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 16171544Seschrock 16181544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1619*10614SJonathan.Adams@Sun.COM zio_bad_cksum_t zbc; 1620*10614SJonathan.Adams@Sun.COM raidz_map_t *rm = zio->io_vsd; 1621*10614SJonathan.Adams@Sun.COM 16221544Seschrock mutex_enter(&vd->vdev_stat_lock); 16231544Seschrock vd->vdev_stat.vs_checksum_errors++; 16241544Seschrock mutex_exit(&vd->vdev_stat_lock); 1625*10614SJonathan.Adams@Sun.COM 1626*10614SJonathan.Adams@Sun.COM zbc.zbc_has_cksum = 0; 1627*10614SJonathan.Adams@Sun.COM zbc.zbc_injected = rm->rm_ecksuminjected; 1628*10614SJonathan.Adams@Sun.COM 1629*10614SJonathan.Adams@Sun.COM zfs_ereport_post_checksum(zio->io_spa, vd, zio, 1630*10614SJonathan.Adams@Sun.COM rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, 1631*10614SJonathan.Adams@Sun.COM &zbc); 16321544Seschrock } 1633*10614SJonathan.Adams@Sun.COM } 16341544Seschrock 1635*10614SJonathan.Adams@Sun.COM /* 1636*10614SJonathan.Adams@Sun.COM * We keep track of whether or not there were any injected errors, so that 1637*10614SJonathan.Adams@Sun.COM * any ereports we generate can note it. 
1638*10614SJonathan.Adams@Sun.COM */ 1639*10614SJonathan.Adams@Sun.COM static int 1640*10614SJonathan.Adams@Sun.COM raidz_checksum_verify(zio_t *zio) 1641*10614SJonathan.Adams@Sun.COM { 1642*10614SJonathan.Adams@Sun.COM zio_bad_cksum_t zbc; 1643*10614SJonathan.Adams@Sun.COM raidz_map_t *rm = zio->io_vsd; 1644*10614SJonathan.Adams@Sun.COM 1645*10614SJonathan.Adams@Sun.COM int ret = zio_checksum_error(zio, &zbc); 1646*10614SJonathan.Adams@Sun.COM if (ret != 0 && zbc.zbc_injected != 0) 1647*10614SJonathan.Adams@Sun.COM rm->rm_ecksuminjected = 1; 1648*10614SJonathan.Adams@Sun.COM 1649*10614SJonathan.Adams@Sun.COM return (ret); 16501544Seschrock } 16511544Seschrock 16522082Seschrock /* 16532082Seschrock * Generate the parity from the data columns. If we tried and were able to 16542082Seschrock * read the parity without error, verify that the generated parity matches the 16552082Seschrock * data we read. If it doesn't, we fire off a checksum error. Return the 16562082Seschrock * number such failures. 
16572082Seschrock */ 16582082Seschrock static int 16592082Seschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 16602082Seschrock { 16612082Seschrock void *orig[VDEV_RAIDZ_MAXPARITY]; 16622082Seschrock int c, ret = 0; 16632082Seschrock raidz_col_t *rc; 16642082Seschrock 16652082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 16662082Seschrock rc = &rm->rm_col[c]; 16672082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 16682082Seschrock continue; 16692082Seschrock orig[c] = zio_buf_alloc(rc->rc_size); 16702082Seschrock bcopy(rc->rc_data, orig[c], rc->rc_size); 16712082Seschrock } 16722082Seschrock 167310105Sadam.leventhal@sun.com vdev_raidz_generate_parity(rm); 16742082Seschrock 16752082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 16762082Seschrock rc = &rm->rm_col[c]; 16772082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 16782082Seschrock continue; 16792082Seschrock if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1680*10614SJonathan.Adams@Sun.COM raidz_checksum_error(zio, rc, orig[c]); 16812082Seschrock rc->rc_error = ECKSUM; 16822082Seschrock ret++; 16832082Seschrock } 16842082Seschrock zio_buf_free(orig[c], rc->rc_size); 16852082Seschrock } 16862082Seschrock 16872082Seschrock return (ret); 16882082Seschrock } 16892082Seschrock 169010105Sadam.leventhal@sun.com /* 169110105Sadam.leventhal@sun.com * Keep statistics on all the ways that we used parity to correct data. 
169210105Sadam.leventhal@sun.com */ 169310105Sadam.leventhal@sun.com static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 16941544Seschrock 16955530Sbonwick static int 16967754SJeff.Bonwick@Sun.COM vdev_raidz_worst_error(raidz_map_t *rm) 16977754SJeff.Bonwick@Sun.COM { 16987754SJeff.Bonwick@Sun.COM int error = 0; 16997754SJeff.Bonwick@Sun.COM 17007754SJeff.Bonwick@Sun.COM for (int c = 0; c < rm->rm_cols; c++) 17017754SJeff.Bonwick@Sun.COM error = zio_worst_error(error, rm->rm_col[c].rc_error); 17027754SJeff.Bonwick@Sun.COM 17037754SJeff.Bonwick@Sun.COM return (error); 17047754SJeff.Bonwick@Sun.COM } 17057754SJeff.Bonwick@Sun.COM 170610105Sadam.leventhal@sun.com /* 170710105Sadam.leventhal@sun.com * Iterate over all combinations of bad data and attempt a reconstruction. 170810105Sadam.leventhal@sun.com * Note that the algorithm below is non-optimal because it doesn't take into 170910105Sadam.leventhal@sun.com * account how reconstruction is actually performed. For example, with 171010105Sadam.leventhal@sun.com * triple-parity RAID-Z the reconstruction procedure is the same if column 4 171110105Sadam.leventhal@sun.com * is targeted as invalid as if columns 1 and 4 are targeted since in both 171210105Sadam.leventhal@sun.com * cases we'd only use parity information in column 0. 
171310105Sadam.leventhal@sun.com */ 171410105Sadam.leventhal@sun.com static int 171510105Sadam.leventhal@sun.com vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) 171610105Sadam.leventhal@sun.com { 171710105Sadam.leventhal@sun.com raidz_map_t *rm = zio->io_vsd; 171810105Sadam.leventhal@sun.com raidz_col_t *rc; 171910105Sadam.leventhal@sun.com void *orig[VDEV_RAIDZ_MAXPARITY]; 172010105Sadam.leventhal@sun.com int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 172110105Sadam.leventhal@sun.com int *tgts = &tstore[1]; 172210105Sadam.leventhal@sun.com int current, next, i, c, n; 172310105Sadam.leventhal@sun.com int code, ret = 0; 172410105Sadam.leventhal@sun.com 172510105Sadam.leventhal@sun.com ASSERT(total_errors < rm->rm_firstdatacol); 172610105Sadam.leventhal@sun.com 172710105Sadam.leventhal@sun.com /* 172810105Sadam.leventhal@sun.com * This simplifies one edge condition. 172910105Sadam.leventhal@sun.com */ 173010105Sadam.leventhal@sun.com tgts[-1] = -1; 173110105Sadam.leventhal@sun.com 173210105Sadam.leventhal@sun.com for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 173310105Sadam.leventhal@sun.com /* 173410105Sadam.leventhal@sun.com * Initialize the targets array by finding the first n columns 173510105Sadam.leventhal@sun.com * that contain no error. 173610105Sadam.leventhal@sun.com * 173710105Sadam.leventhal@sun.com * If there were no data errors, we need to ensure that we're 173810105Sadam.leventhal@sun.com * always explicitly attempting to reconstruct at least one 173910105Sadam.leventhal@sun.com * data column. To do this, we simply push the highest target 174010105Sadam.leventhal@sun.com * up into the data columns. 
174110105Sadam.leventhal@sun.com */ 174210105Sadam.leventhal@sun.com for (c = 0, i = 0; i < n; i++) { 174310105Sadam.leventhal@sun.com if (i == n - 1 && data_errors == 0 && 174410105Sadam.leventhal@sun.com c < rm->rm_firstdatacol) { 174510105Sadam.leventhal@sun.com c = rm->rm_firstdatacol; 174610105Sadam.leventhal@sun.com } 174710105Sadam.leventhal@sun.com 174810105Sadam.leventhal@sun.com while (rm->rm_col[c].rc_error != 0) { 174910105Sadam.leventhal@sun.com c++; 175010105Sadam.leventhal@sun.com ASSERT3S(c, <, rm->rm_cols); 175110105Sadam.leventhal@sun.com } 175210105Sadam.leventhal@sun.com 175310105Sadam.leventhal@sun.com tgts[i] = c++; 175410105Sadam.leventhal@sun.com } 175510105Sadam.leventhal@sun.com 175610105Sadam.leventhal@sun.com /* 175710105Sadam.leventhal@sun.com * Setting tgts[n] simplifies the other edge condition. 175810105Sadam.leventhal@sun.com */ 175910105Sadam.leventhal@sun.com tgts[n] = rm->rm_cols; 176010105Sadam.leventhal@sun.com 176110105Sadam.leventhal@sun.com /* 176210105Sadam.leventhal@sun.com * These buffers were allocated in previous iterations. 176310105Sadam.leventhal@sun.com */ 176410105Sadam.leventhal@sun.com for (i = 0; i < n - 1; i++) { 176510105Sadam.leventhal@sun.com ASSERT(orig[i] != NULL); 176610105Sadam.leventhal@sun.com } 176710105Sadam.leventhal@sun.com 176810105Sadam.leventhal@sun.com orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); 176910105Sadam.leventhal@sun.com 177010105Sadam.leventhal@sun.com current = 0; 177110105Sadam.leventhal@sun.com next = tgts[current]; 177210105Sadam.leventhal@sun.com 177310105Sadam.leventhal@sun.com while (current != n) { 177410105Sadam.leventhal@sun.com tgts[current] = next; 177510105Sadam.leventhal@sun.com current = 0; 177610105Sadam.leventhal@sun.com 177710105Sadam.leventhal@sun.com /* 177810105Sadam.leventhal@sun.com * Save off the original data that we're going to 177910105Sadam.leventhal@sun.com * attempt to reconstruct. 
178010105Sadam.leventhal@sun.com */ 178110105Sadam.leventhal@sun.com for (i = 0; i < n; i++) { 178210105Sadam.leventhal@sun.com ASSERT(orig[i] != NULL); 178310105Sadam.leventhal@sun.com c = tgts[i]; 178410105Sadam.leventhal@sun.com ASSERT3S(c, >=, 0); 178510105Sadam.leventhal@sun.com ASSERT3S(c, <, rm->rm_cols); 178610105Sadam.leventhal@sun.com rc = &rm->rm_col[c]; 178710105Sadam.leventhal@sun.com bcopy(rc->rc_data, orig[i], rc->rc_size); 178810105Sadam.leventhal@sun.com } 178910105Sadam.leventhal@sun.com 179010105Sadam.leventhal@sun.com /* 179110105Sadam.leventhal@sun.com * Attempt a reconstruction and exit the outer loop on 179210105Sadam.leventhal@sun.com * success. 179310105Sadam.leventhal@sun.com */ 179410105Sadam.leventhal@sun.com code = vdev_raidz_reconstruct(rm, tgts, n); 1795*10614SJonathan.Adams@Sun.COM if (raidz_checksum_verify(zio) == 0) { 179610105Sadam.leventhal@sun.com atomic_inc_64(&raidz_corrected[code]); 179710105Sadam.leventhal@sun.com 179810105Sadam.leventhal@sun.com for (i = 0; i < n; i++) { 179910105Sadam.leventhal@sun.com c = tgts[i]; 180010105Sadam.leventhal@sun.com rc = &rm->rm_col[c]; 180110105Sadam.leventhal@sun.com ASSERT(rc->rc_error == 0); 1802*10614SJonathan.Adams@Sun.COM if (rc->rc_tried) 1803*10614SJonathan.Adams@Sun.COM raidz_checksum_error(zio, rc, 1804*10614SJonathan.Adams@Sun.COM orig[i]); 180510105Sadam.leventhal@sun.com rc->rc_error = ECKSUM; 180610105Sadam.leventhal@sun.com } 180710105Sadam.leventhal@sun.com 180810105Sadam.leventhal@sun.com ret = code; 180910105Sadam.leventhal@sun.com goto done; 181010105Sadam.leventhal@sun.com } 181110105Sadam.leventhal@sun.com 181210105Sadam.leventhal@sun.com /* 181310105Sadam.leventhal@sun.com * Restore the original data. 
181410105Sadam.leventhal@sun.com */ 181510105Sadam.leventhal@sun.com for (i = 0; i < n; i++) { 181610105Sadam.leventhal@sun.com c = tgts[i]; 181710105Sadam.leventhal@sun.com rc = &rm->rm_col[c]; 181810105Sadam.leventhal@sun.com bcopy(orig[i], rc->rc_data, rc->rc_size); 181910105Sadam.leventhal@sun.com } 182010105Sadam.leventhal@sun.com 182110105Sadam.leventhal@sun.com do { 182210105Sadam.leventhal@sun.com /* 182310105Sadam.leventhal@sun.com * Find the next valid column after the current 182410105Sadam.leventhal@sun.com * position.. 182510105Sadam.leventhal@sun.com */ 182610105Sadam.leventhal@sun.com for (next = tgts[current] + 1; 182710105Sadam.leventhal@sun.com next < rm->rm_cols && 182810105Sadam.leventhal@sun.com rm->rm_col[next].rc_error != 0; next++) 182910105Sadam.leventhal@sun.com continue; 183010105Sadam.leventhal@sun.com 183110105Sadam.leventhal@sun.com ASSERT(next <= tgts[current + 1]); 183210105Sadam.leventhal@sun.com 183310105Sadam.leventhal@sun.com /* 183410105Sadam.leventhal@sun.com * If that spot is available, we're done here. 183510105Sadam.leventhal@sun.com */ 183610105Sadam.leventhal@sun.com if (next != tgts[current + 1]) 183710105Sadam.leventhal@sun.com break; 183810105Sadam.leventhal@sun.com 183910105Sadam.leventhal@sun.com /* 184010105Sadam.leventhal@sun.com * Otherwise, find the next valid column after 184110105Sadam.leventhal@sun.com * the previous position. 
184210105Sadam.leventhal@sun.com */ 184310105Sadam.leventhal@sun.com for (c = tgts[current - 1] + 1; 184410105Sadam.leventhal@sun.com rm->rm_col[c].rc_error != 0; c++) 184510105Sadam.leventhal@sun.com continue; 184610105Sadam.leventhal@sun.com 184710105Sadam.leventhal@sun.com tgts[current] = c; 184810105Sadam.leventhal@sun.com current++; 184910105Sadam.leventhal@sun.com 185010105Sadam.leventhal@sun.com } while (current != n); 185110105Sadam.leventhal@sun.com } 185210105Sadam.leventhal@sun.com } 185310105Sadam.leventhal@sun.com n--; 185410105Sadam.leventhal@sun.com done: 185510105Sadam.leventhal@sun.com for (i = 0; i < n; i++) { 185610105Sadam.leventhal@sun.com zio_buf_free(orig[i], rm->rm_col[0].rc_size); 185710105Sadam.leventhal@sun.com } 185810105Sadam.leventhal@sun.com 185910105Sadam.leventhal@sun.com return (ret); 186010105Sadam.leventhal@sun.com } 186110105Sadam.leventhal@sun.com 18627754SJeff.Bonwick@Sun.COM static void 1863789Sahrens vdev_raidz_io_done(zio_t *zio) 1864789Sahrens { 1865789Sahrens vdev_t *vd = zio->io_vd; 1866789Sahrens vdev_t *cvd; 1867789Sahrens raidz_map_t *rm = zio->io_vsd; 186810105Sadam.leventhal@sun.com raidz_col_t *rc; 1869789Sahrens int unexpected_errors = 0; 18702082Seschrock int parity_errors = 0; 18713456Sahl int parity_untried = 0; 18722082Seschrock int data_errors = 0; 18737754SJeff.Bonwick@Sun.COM int total_errors = 0; 187410105Sadam.leventhal@sun.com int n, c; 187510105Sadam.leventhal@sun.com int tgts[VDEV_RAIDZ_MAXPARITY]; 187610105Sadam.leventhal@sun.com int code; 1877789Sahrens 18781775Sbillm ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 1879789Sahrens 18802082Seschrock ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 18812082Seschrock ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 18822082Seschrock 1883789Sahrens for (c = 0; c < rm->rm_cols; c++) { 1884789Sahrens rc = &rm->rm_col[c]; 1885789Sahrens 1886789Sahrens if (rc->rc_error) { 18877754SJeff.Bonwick@Sun.COM 
ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 18882082Seschrock 18892082Seschrock if (c < rm->rm_firstdatacol) 18902082Seschrock parity_errors++; 18912082Seschrock else 18922082Seschrock data_errors++; 18932082Seschrock 1894789Sahrens if (!rc->rc_skipped) 1895789Sahrens unexpected_errors++; 18962082Seschrock 18977754SJeff.Bonwick@Sun.COM total_errors++; 18983456Sahl } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 18993456Sahl parity_untried++; 1900789Sahrens } 1901789Sahrens } 1902789Sahrens 1903789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 1904789Sahrens /* 19057754SJeff.Bonwick@Sun.COM * XXX -- for now, treat partial writes as a success. 19067754SJeff.Bonwick@Sun.COM * (If we couldn't write enough columns to reconstruct 19077754SJeff.Bonwick@Sun.COM * the data, the I/O failed. Otherwise, good enough.) 19087754SJeff.Bonwick@Sun.COM * 19097754SJeff.Bonwick@Sun.COM * Now that we support write reallocation, it would be better 19107754SJeff.Bonwick@Sun.COM * to treat partial failure as real failure unless there are 19117754SJeff.Bonwick@Sun.COM * no non-degraded top-level vdevs left, and not update DTLs 19127754SJeff.Bonwick@Sun.COM * if we intend to reallocate. 1913789Sahrens */ 1914789Sahrens /* XXPOLICY */ 19157754SJeff.Bonwick@Sun.COM if (total_errors > rm->rm_firstdatacol) 19167754SJeff.Bonwick@Sun.COM zio->io_error = vdev_raidz_worst_error(rm); 1917789Sahrens 19187754SJeff.Bonwick@Sun.COM return; 1919789Sahrens } 1920789Sahrens 1921789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 19222082Seschrock /* 19232082Seschrock * There are three potential phases for a read: 19242082Seschrock * 1. produce valid data from the columns read 19252082Seschrock * 2. read all disks and try again 19262082Seschrock * 3. perform combinatorial reconstruction 19272082Seschrock * 19282082Seschrock * Each phase is progressively both more expensive and less likely to 19292082Seschrock * occur. 
If we encounter more errors than we can repair or all phases 19302082Seschrock * fail, we have no choice but to return an error. 19312082Seschrock */ 1932789Sahrens 1933789Sahrens /* 19342082Seschrock * If the number of errors we saw was correctable -- less than or equal 19353456Sahl * to the number of parity disks read -- attempt to produce data that 19363456Sahl * has a valid checksum. Naturally, this case applies in the absence of 19373456Sahl * any errors. 1938789Sahrens */ 19397754SJeff.Bonwick@Sun.COM if (total_errors <= rm->rm_firstdatacol - parity_untried) { 194010105Sadam.leventhal@sun.com if (data_errors == 0) { 1941*10614SJonathan.Adams@Sun.COM if (raidz_checksum_verify(zio) == 0) { 19424034Sahl /* 19434034Sahl * If we read parity information (unnecessarily 19444034Sahl * as it happens since no reconstruction was 19454034Sahl * needed) regenerate and verify the parity. 19464034Sahl * We also regenerate parity when resilvering 19474034Sahl * so we can write it out to the failed device 19484034Sahl * later. 19494034Sahl */ 19503456Sahl if (parity_errors + parity_untried < 19514034Sahl rm->rm_firstdatacol || 19524034Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) { 19533456Sahl n = raidz_parity_verify(zio, rm); 19543456Sahl unexpected_errors += n; 19553456Sahl ASSERT(parity_errors + n <= 19563456Sahl rm->rm_firstdatacol); 19573456Sahl } 19582082Seschrock goto done; 19592082Seschrock } 196010105Sadam.leventhal@sun.com } else { 19613456Sahl /* 19623456Sahl * We either attempt to read all the parity columns or 19633456Sahl * none of them. If we didn't try to read parity, we 19643456Sahl * wouldn't be here in the correctable case. There must 19653456Sahl * also have been fewer parity errors than parity 19663456Sahl * columns or, again, we wouldn't be in this code path. 
19673456Sahl */ 19683456Sahl ASSERT(parity_untried == 0); 19692082Seschrock ASSERT(parity_errors < rm->rm_firstdatacol); 19702082Seschrock 19712082Seschrock /* 197210105Sadam.leventhal@sun.com * Identify the data columns that reported an error. 19732082Seschrock */ 197410105Sadam.leventhal@sun.com n = 0; 19752082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 19762082Seschrock rc = &rm->rm_col[c]; 197710105Sadam.leventhal@sun.com if (rc->rc_error != 0) { 197810105Sadam.leventhal@sun.com ASSERT(n < VDEV_RAIDZ_MAXPARITY); 197910105Sadam.leventhal@sun.com tgts[n++] = c; 198010105Sadam.leventhal@sun.com } 19812082Seschrock } 19822082Seschrock 198310105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol >= n); 198410105Sadam.leventhal@sun.com 198510105Sadam.leventhal@sun.com code = vdev_raidz_reconstruct(rm, tgts, n); 19862082Seschrock 1987*10614SJonathan.Adams@Sun.COM if (raidz_checksum_verify(zio) == 0) { 198810105Sadam.leventhal@sun.com atomic_inc_64(&raidz_corrected[code]); 1989789Sahrens 19902082Seschrock /* 199110105Sadam.leventhal@sun.com * If we read more parity disks than were used 199210105Sadam.leventhal@sun.com * for reconstruction, confirm that the other 199310105Sadam.leventhal@sun.com * parity disks produced correct data. This 199410105Sadam.leventhal@sun.com * routine is suboptimal in that it regenerates 199510105Sadam.leventhal@sun.com * the parity that we already used in addition 199610105Sadam.leventhal@sun.com * to the parity that we're attempting to 199710105Sadam.leventhal@sun.com * verify, but this should be a relatively 199810105Sadam.leventhal@sun.com * uncommon case, and can be optimized if it 199910105Sadam.leventhal@sun.com * becomes a problem. Note that we regenerate 200010105Sadam.leventhal@sun.com * parity when resilvering so we can write it 200110105Sadam.leventhal@sun.com * out to failed devices later. 
20022082Seschrock */ 200310105Sadam.leventhal@sun.com if (parity_errors < rm->rm_firstdatacol - n || 20044034Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) { 20052082Seschrock n = raidz_parity_verify(zio, rm); 20062082Seschrock unexpected_errors += n; 20072082Seschrock ASSERT(parity_errors + n <= 20082082Seschrock rm->rm_firstdatacol); 20092082Seschrock } 20102082Seschrock 20112082Seschrock goto done; 20122082Seschrock } 2013789Sahrens } 2014789Sahrens } 2015789Sahrens 2016789Sahrens /* 20172082Seschrock * This isn't a typical situation -- either we got a read error or 20182082Seschrock * a child silently returned bad data. Read every block so we can 20192082Seschrock * try again with as much data and parity as we can track down. If 20202082Seschrock * we've already been through once before, all children will be marked 20212082Seschrock * as tried so we'll proceed to combinatorial reconstruction. 2022789Sahrens */ 2023789Sahrens unexpected_errors = 1; 20242082Seschrock rm->rm_missingdata = 0; 20252082Seschrock rm->rm_missingparity = 0; 2026789Sahrens 20272082Seschrock for (c = 0; c < rm->rm_cols; c++) { 20282082Seschrock if (rm->rm_col[c].rc_tried) 20292082Seschrock continue; 2030789Sahrens 2031789Sahrens zio_vdev_io_redone(zio); 20322082Seschrock do { 2033789Sahrens rc = &rm->rm_col[c]; 2034789Sahrens if (rc->rc_tried) 2035789Sahrens continue; 2036789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, 20372082Seschrock vd->vdev_child[rc->rc_devidx], 2038789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 20397754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0, 2040789Sahrens vdev_raidz_child_done, rc)); 20412082Seschrock } while (++c < rm->rm_cols); 20425530Sbonwick 20437754SJeff.Bonwick@Sun.COM return; 2044789Sahrens } 2045789Sahrens 2046789Sahrens /* 20472082Seschrock * At this point we've attempted to reconstruct the data given the 20482082Seschrock * errors we detected, and we've attempted to read all columns. 
There 20492082Seschrock * must, therefore, be one or more additional problems -- silent errors 20502082Seschrock * resulting in invalid data rather than explicit I/O errors resulting 205110105Sadam.leventhal@sun.com * in absent data. We check if there is enough additional data to 205210105Sadam.leventhal@sun.com * possibly reconstruct the data and then perform combinatorial 205310105Sadam.leventhal@sun.com * reconstruction over all possible combinations. If that fails, 205410105Sadam.leventhal@sun.com * we're cooked. 2055789Sahrens */ 2056*10614SJonathan.Adams@Sun.COM if (total_errors > rm->rm_firstdatacol) { 20577754SJeff.Bonwick@Sun.COM zio->io_error = vdev_raidz_worst_error(rm); 20582082Seschrock 2059*10614SJonathan.Adams@Sun.COM } else if (total_errors < rm->rm_firstdatacol && 2060*10614SJonathan.Adams@Sun.COM (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { 20612082Seschrock /* 206210105Sadam.leventhal@sun.com * If we didn't use all the available parity for the 206310105Sadam.leventhal@sun.com * combinatorial reconstruction, verify that the remaining 206410105Sadam.leventhal@sun.com * parity is correct. 20652082Seschrock */ 206610105Sadam.leventhal@sun.com if (code != (1 << rm->rm_firstdatacol) - 1) 206710105Sadam.leventhal@sun.com (void) raidz_parity_verify(zio, rm); 206810105Sadam.leventhal@sun.com } else { 206910105Sadam.leventhal@sun.com /* 2070*10614SJonathan.Adams@Sun.COM * We're here because either: 2071*10614SJonathan.Adams@Sun.COM * 2072*10614SJonathan.Adams@Sun.COM * total_errors == rm_first_datacol, or 2073*10614SJonathan.Adams@Sun.COM * vdev_raidz_combrec() failed 2074*10614SJonathan.Adams@Sun.COM * 2075*10614SJonathan.Adams@Sun.COM * In either case, there is enough bad data to prevent 2076*10614SJonathan.Adams@Sun.COM * reconstruction. 2077*10614SJonathan.Adams@Sun.COM * 2078*10614SJonathan.Adams@Sun.COM * Start checksum ereports for all children which haven't 2079*10614SJonathan.Adams@Sun.COM * failed. 
208010105Sadam.leventhal@sun.com */ 208110105Sadam.leventhal@sun.com zio->io_error = ECKSUM; 20822082Seschrock 2083*10614SJonathan.Adams@Sun.COM for (c = 0; c < rm->rm_cols; c++) { 2084*10614SJonathan.Adams@Sun.COM rc = &rm->rm_col[c]; 2085*10614SJonathan.Adams@Sun.COM if (rc->rc_error == 0) { 2086*10614SJonathan.Adams@Sun.COM zio_bad_cksum_t zbc; 2087*10614SJonathan.Adams@Sun.COM zbc.zbc_has_cksum = 0; 2088*10614SJonathan.Adams@Sun.COM zbc.zbc_injected = rm->rm_ecksuminjected; 2089*10614SJonathan.Adams@Sun.COM 2090*10614SJonathan.Adams@Sun.COM zfs_ereport_start_checksum( 209110105Sadam.leventhal@sun.com zio->io_spa, vd->vdev_child[rc->rc_devidx], 2092*10614SJonathan.Adams@Sun.COM zio, rc->rc_offset, rc->rc_size, 2093*10614SJonathan.Adams@Sun.COM (void *)(uintptr_t)c, &zbc); 20942082Seschrock } 20951544Seschrock } 20961544Seschrock } 2097789Sahrens 2098789Sahrens done: 2099789Sahrens zio_checksum_verified(zio); 2100789Sahrens 21018241SJeff.Bonwick@Sun.COM if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2102789Sahrens (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2103789Sahrens /* 2104789Sahrens * Use the good data we have in hand to repair damaged children. 2105789Sahrens */ 2106789Sahrens for (c = 0; c < rm->rm_cols; c++) { 2107789Sahrens rc = &rm->rm_col[c]; 21082082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 2109789Sahrens 21101732Sbonwick if (rc->rc_error == 0) 21111732Sbonwick continue; 21121732Sbonwick 21137754SJeff.Bonwick@Sun.COM zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 21141732Sbonwick rc->rc_offset, rc->rc_data, rc->rc_size, 21151732Sbonwick ZIO_TYPE_WRITE, zio->io_priority, 21168241SJeff.Bonwick@Sun.COM ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
21178241SJeff.Bonwick@Sun.COM ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 21181732Sbonwick } 2119789Sahrens } 2120789Sahrens } 2121789Sahrens 2122789Sahrens static void 2123789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2124789Sahrens { 21252082Seschrock if (faulted > vd->vdev_nparity) 21261544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 21271544Seschrock VDEV_AUX_NO_REPLICAS); 2128789Sahrens else if (degraded + faulted != 0) 21291544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2130789Sahrens else 21311544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2132789Sahrens } 2133789Sahrens 2134789Sahrens vdev_ops_t vdev_raidz_ops = { 2135789Sahrens vdev_raidz_open, 2136789Sahrens vdev_raidz_close, 2137789Sahrens vdev_raidz_asize, 2138789Sahrens vdev_raidz_io_start, 2139789Sahrens vdev_raidz_io_done, 2140789Sahrens vdev_raidz_state_change, 2141789Sahrens VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2142789Sahrens B_FALSE /* not a leaf vdev */ 2143789Sahrens }; 2144