1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 212082Seschrock 22789Sahrens /* 239434SMark.Musante@Sun.COM * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24789Sahrens * Use is subject to license terms. 25789Sahrens */ 26789Sahrens 27789Sahrens #include <sys/zfs_context.h> 28789Sahrens #include <sys/spa.h> 29789Sahrens #include <sys/vdev_impl.h> 30789Sahrens #include <sys/zio.h> 31789Sahrens #include <sys/zio_checksum.h> 32789Sahrens #include <sys/fs/zfs.h> 331544Seschrock #include <sys/fm/fs/zfs.h> 34789Sahrens 35789Sahrens /* 36789Sahrens * Virtual device vector for RAID-Z. 372082Seschrock * 3810105Sadam.leventhal@sun.com * This vdev supports single, double, and triple parity. For single parity, 3910105Sadam.leventhal@sun.com * we use a simple XOR of all the data columns. For double or triple parity, 4010105Sadam.leventhal@sun.com * we use a special case of Reed-Solomon coding. 
This extends the 4110105Sadam.leventhal@sun.com * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 4210105Sadam.leventhal@sun.com * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 4310105Sadam.leventhal@sun.com * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 4410105Sadam.leventhal@sun.com * former is also based. The latter is designed to provide higher performance 4510105Sadam.leventhal@sun.com * for writes. 4610105Sadam.leventhal@sun.com * 4710105Sadam.leventhal@sun.com * Note that the Plank paper claimed to support arbitrary N+M, but was then 4810105Sadam.leventhal@sun.com * amended six years later identifying a critical flaw that invalidates its 4910105Sadam.leventhal@sun.com * claims. Nevertheless, the technique can be adapted to work for up to 5010105Sadam.leventhal@sun.com * triple parity. For additional parity, the amendment "Note: Correction to 5110105Sadam.leventhal@sun.com * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 5210105Sadam.leventhal@sun.com * is viable, but the additional complexity means that write performance will 5310105Sadam.leventhal@sun.com * suffer. 5410105Sadam.leventhal@sun.com * 5510105Sadam.leventhal@sun.com * All of the methods above operate on a Galois field, defined over the 5610105Sadam.leventhal@sun.com * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 5710105Sadam.leventhal@sun.com * can be expressed with a single byte. 
Briefly, the operations on the 5810105Sadam.leventhal@sun.com * field are defined as follows: 592082Seschrock * 602082Seschrock * o addition (+) is represented by a bitwise XOR 612082Seschrock * o subtraction (-) is therefore identical to addition: A + B = A - B 622082Seschrock * o multiplication of A by 2 is defined by the following bitwise expression: 632082Seschrock * (A * 2)_7 = A_6 642082Seschrock * (A * 2)_6 = A_5 652082Seschrock * (A * 2)_5 = A_4 662082Seschrock * (A * 2)_4 = A_3 + A_7 672082Seschrock * (A * 2)_3 = A_2 + A_7 682082Seschrock * (A * 2)_2 = A_1 + A_7 692082Seschrock * (A * 2)_1 = A_0 702082Seschrock * (A * 2)_0 = A_7 712082Seschrock * 722082Seschrock * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 7310105Sadam.leventhal@sun.com * As an aside, this multiplication is derived from the error correcting 7410105Sadam.leventhal@sun.com * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 752082Seschrock * 762082Seschrock * Observe that any number in the field (except for 0) can be expressed as a 772082Seschrock * power of 2 -- a generator for the field. We store a table of the powers of 782082Seschrock * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 792082Seschrock * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 8010105Sadam.leventhal@sun.com * than field addition). The inverse of a field element A (A^-1) is therefore 8110105Sadam.leventhal@sun.com * A ^ (255 - 1) = A^254. 822082Seschrock * 8310105Sadam.leventhal@sun.com * The up-to-three parity columns, P, Q, R over several data columns, 8410105Sadam.leventhal@sun.com * D_0, ... D_n-1, can be expressed by field operations: 852082Seschrock * 862082Seschrock * P = D_0 + D_1 + ... + D_n-2 + D_n-1 872082Seschrock * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 882082Seschrock * = ((...((D_0) * 2 + D_1) * 2 + ...) 
* 2 + D_n-2) * 2 + D_n-1 8910105Sadam.leventhal@sun.com * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 9010105Sadam.leventhal@sun.com * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 912082Seschrock * 9210105Sadam.leventhal@sun.com * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival 9310105Sadam.leventhal@sun.com * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 9410105Sadam.leventhal@sun.com * independent coefficients. (There are no additional coefficients that have 9510105Sadam.leventhal@sun.com * this property which is why the uncorrected Plank method breaks down.) 9610105Sadam.leventhal@sun.com * 9710105Sadam.leventhal@sun.com * See the reconstruction code below for how P, Q and R can used individually 9810105Sadam.leventhal@sun.com * or in concert to recover missing data columns. 99789Sahrens */ 100789Sahrens 101789Sahrens typedef struct raidz_col { 1022082Seschrock uint64_t rc_devidx; /* child device index for I/O */ 1032082Seschrock uint64_t rc_offset; /* device offset */ 1042082Seschrock uint64_t rc_size; /* I/O size */ 1052082Seschrock void *rc_data; /* I/O data */ 10610614SJonathan.Adams@Sun.COM void *rc_gdata; /* used to store the "good" version */ 1072082Seschrock int rc_error; /* I/O error for this device */ 1082082Seschrock uint8_t rc_tried; /* Did we attempt this I/O column? */ 1092082Seschrock uint8_t rc_skipped; /* Did we skip this I/O column? 
*/ 110789Sahrens } raidz_col_t; 111789Sahrens 112789Sahrens typedef struct raidz_map { 11310105Sadam.leventhal@sun.com uint64_t rm_cols; /* Regular column count */ 11410105Sadam.leventhal@sun.com uint64_t rm_scols; /* Count including skipped columns */ 1152082Seschrock uint64_t rm_bigcols; /* Number of oversized columns */ 1162082Seschrock uint64_t rm_asize; /* Actual total I/O size */ 1172082Seschrock uint64_t rm_missingdata; /* Count of missing data devices */ 1182082Seschrock uint64_t rm_missingparity; /* Count of missing parity devices */ 1192082Seschrock uint64_t rm_firstdatacol; /* First data column/parity count */ 12010450Sadam.leventhal@sun.com uint64_t rm_nskip; /* Skipped sectors for padding */ 12110450Sadam.leventhal@sun.com uint64_t rm_skipstart; /* Column index of padding start */ 12210614SJonathan.Adams@Sun.COM void *rm_datacopy; /* rm_asize-buffer of copied data */ 12310614SJonathan.Adams@Sun.COM uintptr_t rm_reports; /* # of referencing checksum reports */ 12410614SJonathan.Adams@Sun.COM uint8_t rm_freed; /* map no longer has referencing ZIO */ 12510614SJonathan.Adams@Sun.COM uint8_t rm_ecksuminjected; /* checksum error was injected */ 1262082Seschrock raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 127789Sahrens } raidz_map_t; 128789Sahrens 1292082Seschrock #define VDEV_RAIDZ_P 0 1302082Seschrock #define VDEV_RAIDZ_Q 1 13110105Sadam.leventhal@sun.com #define VDEV_RAIDZ_R 2 13210105Sadam.leventhal@sun.com #define VDEV_RAIDZ_MAXPARITY 3 1332082Seschrock 13410105Sadam.leventhal@sun.com #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 13510105Sadam.leventhal@sun.com #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 1362082Seschrock 13710105Sadam.leventhal@sun.com /* 13810105Sadam.leventhal@sun.com * We provide a mechanism to perform the field multiplication operation on a 13910105Sadam.leventhal@sun.com * 64-bit value all at once rather than a byte at a time. 
This works by 14010105Sadam.leventhal@sun.com * creating a mask from the top bit in each byte and using that to 14110105Sadam.leventhal@sun.com * conditionally apply the XOR of 0x1d. 14210105Sadam.leventhal@sun.com */ 14310105Sadam.leventhal@sun.com #define VDEV_RAIDZ_64MUL_2(x, mask) \ 14410105Sadam.leventhal@sun.com { \ 14510105Sadam.leventhal@sun.com (mask) = (x) & 0x8080808080808080ULL; \ 14610105Sadam.leventhal@sun.com (mask) = ((mask) << 1) - ((mask) >> 7); \ 14710105Sadam.leventhal@sun.com (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 14810105Sadam.leventhal@sun.com ((mask) & 0x1d1d1d1d1d1d1d1d); \ 14910105Sadam.leventhal@sun.com } 15010105Sadam.leventhal@sun.com 15110105Sadam.leventhal@sun.com #define VDEV_RAIDZ_64MUL_4(x, mask) \ 15210105Sadam.leventhal@sun.com { \ 15310105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2((x), mask); \ 15410105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2((x), mask); \ 15510105Sadam.leventhal@sun.com } 15610105Sadam.leventhal@sun.com 15710105Sadam.leventhal@sun.com /* 15810105Sadam.leventhal@sun.com * Force reconstruction to use the general purpose method. 15910105Sadam.leventhal@sun.com */ 16010105Sadam.leventhal@sun.com int vdev_raidz_default_to_general; 1612082Seschrock 1622082Seschrock /* 1632082Seschrock * These two tables represent powers and logs of 2 in the Galois field defined 1642082Seschrock * above. These values were computed by repeatedly multiplying by 2 as above. 
1652082Seschrock */ 1662082Seschrock static const uint8_t vdev_raidz_pow2[256] = { 1672082Seschrock 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 1682082Seschrock 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 1692082Seschrock 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 1702082Seschrock 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 1712082Seschrock 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 1722082Seschrock 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 1732082Seschrock 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 1742082Seschrock 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 1752082Seschrock 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 1762082Seschrock 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 1772082Seschrock 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 1782082Seschrock 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 1792082Seschrock 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 1802082Seschrock 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 1812082Seschrock 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 1822082Seschrock 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 1832082Seschrock 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 1842082Seschrock 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 1852082Seschrock 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 1862082Seschrock 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 1872082Seschrock 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 1882082Seschrock 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 1892082Seschrock 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 1902082Seschrock 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 1912082Seschrock 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 1922082Seschrock 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 1932082Seschrock 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 1942082Seschrock 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 1952082Seschrock 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 1962082Seschrock 0xf7, 0xf3, 
0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 1972082Seschrock 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 1982082Seschrock 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 1992082Seschrock }; 2002082Seschrock static const uint8_t vdev_raidz_log2[256] = { 2012082Seschrock 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 2022082Seschrock 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 2032082Seschrock 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 2042082Seschrock 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 2052082Seschrock 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 2062082Seschrock 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 2072082Seschrock 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 2082082Seschrock 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 2092082Seschrock 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 2102082Seschrock 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 2112082Seschrock 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 2122082Seschrock 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 2132082Seschrock 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 2142082Seschrock 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 2152082Seschrock 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 2162082Seschrock 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 2172082Seschrock 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 2182082Seschrock 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 2192082Seschrock 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 2202082Seschrock 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 2212082Seschrock 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 2222082Seschrock 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 2232082Seschrock 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 2242082Seschrock 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 2252082Seschrock 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 2262082Seschrock 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 2272082Seschrock 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 
0xaa, 2282082Seschrock 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 2292082Seschrock 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 2302082Seschrock 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 2312082Seschrock 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 2322082Seschrock 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 2332082Seschrock }; 2342082Seschrock 23510614SJonathan.Adams@Sun.COM static void vdev_raidz_generate_parity(raidz_map_t *rm); 23610614SJonathan.Adams@Sun.COM 2372082Seschrock /* 2382082Seschrock * Multiply a given number by 2 raised to the given power. 2392082Seschrock */ 2402082Seschrock static uint8_t 2412082Seschrock vdev_raidz_exp2(uint_t a, int exp) 2422082Seschrock { 2432082Seschrock if (a == 0) 2442082Seschrock return (0); 2452082Seschrock 2462082Seschrock ASSERT(exp >= 0); 2472082Seschrock ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 2482082Seschrock 2492082Seschrock exp += vdev_raidz_log2[a]; 2502082Seschrock if (exp > 255) 2512082Seschrock exp -= 255; 2522082Seschrock 2532082Seschrock return (vdev_raidz_pow2[exp]); 2542082Seschrock } 2552082Seschrock 2567754SJeff.Bonwick@Sun.COM static void 25710614SJonathan.Adams@Sun.COM vdev_raidz_map_free(raidz_map_t *rm) 2587754SJeff.Bonwick@Sun.COM { 2597754SJeff.Bonwick@Sun.COM int c; 260*10653SJonathan.Adams@Sun.COM size_t size; 2617754SJeff.Bonwick@Sun.COM 26210614SJonathan.Adams@Sun.COM for (c = 0; c < rm->rm_firstdatacol; c++) { 2637754SJeff.Bonwick@Sun.COM zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 2647754SJeff.Bonwick@Sun.COM 26510614SJonathan.Adams@Sun.COM if (rm->rm_col[c].rc_gdata != NULL) 26610614SJonathan.Adams@Sun.COM zio_buf_free(rm->rm_col[c].rc_gdata, 26710614SJonathan.Adams@Sun.COM rm->rm_col[c].rc_size); 26810614SJonathan.Adams@Sun.COM } 26910614SJonathan.Adams@Sun.COM 270*10653SJonathan.Adams@Sun.COM size = 0; 271*10653SJonathan.Adams@Sun.COM for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 272*10653SJonathan.Adams@Sun.COM size += rm->rm_col[c].rc_size; 
273*10653SJonathan.Adams@Sun.COM 27410614SJonathan.Adams@Sun.COM if (rm->rm_datacopy != NULL) 27510614SJonathan.Adams@Sun.COM zio_buf_free(rm->rm_datacopy, size); 27610614SJonathan.Adams@Sun.COM 27710105Sadam.leventhal@sun.com kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 2787754SJeff.Bonwick@Sun.COM } 2797754SJeff.Bonwick@Sun.COM 28010614SJonathan.Adams@Sun.COM static void 28110614SJonathan.Adams@Sun.COM vdev_raidz_map_free_vsd(zio_t *zio) 28210614SJonathan.Adams@Sun.COM { 28310614SJonathan.Adams@Sun.COM raidz_map_t *rm = zio->io_vsd; 28410614SJonathan.Adams@Sun.COM 28510614SJonathan.Adams@Sun.COM ASSERT3U(rm->rm_freed, ==, 0); 28610614SJonathan.Adams@Sun.COM rm->rm_freed = 1; 28710614SJonathan.Adams@Sun.COM 28810614SJonathan.Adams@Sun.COM if (rm->rm_reports == 0) 28910614SJonathan.Adams@Sun.COM vdev_raidz_map_free(rm); 29010614SJonathan.Adams@Sun.COM } 29110614SJonathan.Adams@Sun.COM 29210614SJonathan.Adams@Sun.COM /*ARGSUSED*/ 29310614SJonathan.Adams@Sun.COM static void 29410614SJonathan.Adams@Sun.COM vdev_raidz_cksum_free(void *arg, size_t ignored) 29510614SJonathan.Adams@Sun.COM { 29610614SJonathan.Adams@Sun.COM raidz_map_t *rm = arg; 29710614SJonathan.Adams@Sun.COM 29810614SJonathan.Adams@Sun.COM ASSERT3U(rm->rm_reports, >, 0); 29910614SJonathan.Adams@Sun.COM 300*10653SJonathan.Adams@Sun.COM if (--rm->rm_reports == 0 && rm->rm_freed != 0) 30110614SJonathan.Adams@Sun.COM vdev_raidz_map_free(rm); 30210614SJonathan.Adams@Sun.COM } 30310614SJonathan.Adams@Sun.COM 30410614SJonathan.Adams@Sun.COM static void 30510614SJonathan.Adams@Sun.COM vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) 30610614SJonathan.Adams@Sun.COM { 30710614SJonathan.Adams@Sun.COM raidz_map_t *rm = zcr->zcr_cbdata; 30810614SJonathan.Adams@Sun.COM size_t c = zcr->zcr_cbinfo; 30910614SJonathan.Adams@Sun.COM size_t x; 31010614SJonathan.Adams@Sun.COM 31110614SJonathan.Adams@Sun.COM const char *good = NULL; 31210614SJonathan.Adams@Sun.COM const char *bad = 
rm->rm_col[c].rc_data; 31310614SJonathan.Adams@Sun.COM 31410614SJonathan.Adams@Sun.COM if (good_data == NULL) { 31510614SJonathan.Adams@Sun.COM zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 31610614SJonathan.Adams@Sun.COM return; 31710614SJonathan.Adams@Sun.COM } 31810614SJonathan.Adams@Sun.COM 31910614SJonathan.Adams@Sun.COM if (c < rm->rm_firstdatacol) { 32010614SJonathan.Adams@Sun.COM /* 32110614SJonathan.Adams@Sun.COM * The first time through, calculate the parity blocks for 32210614SJonathan.Adams@Sun.COM * the good data (this relies on the fact that the good 32310614SJonathan.Adams@Sun.COM * data never changes for a given logical ZIO) 32410614SJonathan.Adams@Sun.COM */ 32510614SJonathan.Adams@Sun.COM if (rm->rm_col[0].rc_gdata == NULL) { 32610614SJonathan.Adams@Sun.COM char *bad_parity[VDEV_RAIDZ_MAXPARITY]; 32710614SJonathan.Adams@Sun.COM char *buf; 32810614SJonathan.Adams@Sun.COM 32910614SJonathan.Adams@Sun.COM /* 33010614SJonathan.Adams@Sun.COM * Set up the rm_col[]s to generate the parity for 33110614SJonathan.Adams@Sun.COM * good_data, first saving the parity bufs and 33210614SJonathan.Adams@Sun.COM * replacing them with buffers to hold the result. 
33310614SJonathan.Adams@Sun.COM */ 33410614SJonathan.Adams@Sun.COM for (x = 0; x < rm->rm_firstdatacol; x++) { 33510614SJonathan.Adams@Sun.COM bad_parity[x] = rm->rm_col[x].rc_data; 33610614SJonathan.Adams@Sun.COM rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = 33710614SJonathan.Adams@Sun.COM zio_buf_alloc(rm->rm_col[x].rc_size); 33810614SJonathan.Adams@Sun.COM } 33910614SJonathan.Adams@Sun.COM 34010614SJonathan.Adams@Sun.COM /* fill in the data columns from good_data */ 34110614SJonathan.Adams@Sun.COM buf = (char *)good_data; 34210614SJonathan.Adams@Sun.COM for (; x < rm->rm_cols; x++) { 34310614SJonathan.Adams@Sun.COM rm->rm_col[x].rc_data = buf; 34410614SJonathan.Adams@Sun.COM buf += rm->rm_col[x].rc_size; 34510614SJonathan.Adams@Sun.COM } 34610614SJonathan.Adams@Sun.COM 34710614SJonathan.Adams@Sun.COM /* 34810614SJonathan.Adams@Sun.COM * Construct the parity from the good data. 34910614SJonathan.Adams@Sun.COM */ 35010614SJonathan.Adams@Sun.COM vdev_raidz_generate_parity(rm); 35110614SJonathan.Adams@Sun.COM 35210614SJonathan.Adams@Sun.COM /* restore everything back to its original state */ 35310614SJonathan.Adams@Sun.COM for (x = 0; x < rm->rm_firstdatacol; x++) 35410614SJonathan.Adams@Sun.COM rm->rm_col[x].rc_data = bad_parity[x]; 35510614SJonathan.Adams@Sun.COM 35610614SJonathan.Adams@Sun.COM buf = rm->rm_datacopy; 35710614SJonathan.Adams@Sun.COM for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 35810614SJonathan.Adams@Sun.COM rm->rm_col[x].rc_data = buf; 35910614SJonathan.Adams@Sun.COM buf += rm->rm_col[x].rc_size; 36010614SJonathan.Adams@Sun.COM } 36110614SJonathan.Adams@Sun.COM } 36210614SJonathan.Adams@Sun.COM 36310614SJonathan.Adams@Sun.COM ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 36410614SJonathan.Adams@Sun.COM good = rm->rm_col[c].rc_gdata; 36510614SJonathan.Adams@Sun.COM } else { 36610614SJonathan.Adams@Sun.COM /* adjust good_data to point at the start of our column */ 36710614SJonathan.Adams@Sun.COM good = good_data; 
36810614SJonathan.Adams@Sun.COM 36910614SJonathan.Adams@Sun.COM for (x = rm->rm_firstdatacol; x < c; x++) 37010614SJonathan.Adams@Sun.COM good += rm->rm_col[x].rc_size; 37110614SJonathan.Adams@Sun.COM } 37210614SJonathan.Adams@Sun.COM 37310614SJonathan.Adams@Sun.COM /* we drop the ereport if it ends up that the data was good */ 37410614SJonathan.Adams@Sun.COM zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 37510614SJonathan.Adams@Sun.COM } 37610614SJonathan.Adams@Sun.COM 37710614SJonathan.Adams@Sun.COM /* 37810614SJonathan.Adams@Sun.COM * Invoked indirectly by zfs_ereport_start_checksum(), called 37910614SJonathan.Adams@Sun.COM * below when our read operation fails completely. The main point 38010614SJonathan.Adams@Sun.COM * is to keep a copy of everything we read from disk, so that at 38110614SJonathan.Adams@Sun.COM * vdev_raidz_cksum_finish() time we can compare it with the good data. 38210614SJonathan.Adams@Sun.COM */ 38310614SJonathan.Adams@Sun.COM static void 38410614SJonathan.Adams@Sun.COM vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 38510614SJonathan.Adams@Sun.COM { 38610614SJonathan.Adams@Sun.COM size_t c = (size_t)(uintptr_t)arg; 38710614SJonathan.Adams@Sun.COM caddr_t buf; 38810614SJonathan.Adams@Sun.COM 38910614SJonathan.Adams@Sun.COM raidz_map_t *rm = zio->io_vsd; 39010614SJonathan.Adams@Sun.COM size_t size; 39110614SJonathan.Adams@Sun.COM 39210614SJonathan.Adams@Sun.COM /* set up the report and bump the refcount */ 39310614SJonathan.Adams@Sun.COM zcr->zcr_cbdata = rm; 39410614SJonathan.Adams@Sun.COM zcr->zcr_cbinfo = c; 39510614SJonathan.Adams@Sun.COM zcr->zcr_finish = vdev_raidz_cksum_finish; 39610614SJonathan.Adams@Sun.COM zcr->zcr_free = vdev_raidz_cksum_free; 39710614SJonathan.Adams@Sun.COM 39810614SJonathan.Adams@Sun.COM rm->rm_reports++; 39910614SJonathan.Adams@Sun.COM ASSERT3U(rm->rm_reports, >, 0); 40010614SJonathan.Adams@Sun.COM 401*10653SJonathan.Adams@Sun.COM if (rm->rm_datacopy != NULL) 
40210614SJonathan.Adams@Sun.COM return; 40310614SJonathan.Adams@Sun.COM 40410614SJonathan.Adams@Sun.COM /* 405*10653SJonathan.Adams@Sun.COM * It's the first time we're called for this raidz_map_t, so we need 406*10653SJonathan.Adams@Sun.COM * to copy the data aside; there's no guarantee that our zio's buffer 407*10653SJonathan.Adams@Sun.COM * won't be re-used for something else. 40810614SJonathan.Adams@Sun.COM * 409*10653SJonathan.Adams@Sun.COM * Our parity data is already in separate buffers, so there's no need 41010614SJonathan.Adams@Sun.COM * to copy them. 41110614SJonathan.Adams@Sun.COM */ 41210614SJonathan.Adams@Sun.COM 413*10653SJonathan.Adams@Sun.COM size = 0; 414*10653SJonathan.Adams@Sun.COM for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 415*10653SJonathan.Adams@Sun.COM size += rm->rm_col[c].rc_size; 41610614SJonathan.Adams@Sun.COM 41710614SJonathan.Adams@Sun.COM buf = rm->rm_datacopy = zio_buf_alloc(size); 418*10653SJonathan.Adams@Sun.COM 419*10653SJonathan.Adams@Sun.COM for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 42010614SJonathan.Adams@Sun.COM raidz_col_t *col = &rm->rm_col[c]; 42110614SJonathan.Adams@Sun.COM 42210614SJonathan.Adams@Sun.COM bcopy(col->rc_data, buf, col->rc_size); 42310614SJonathan.Adams@Sun.COM col->rc_data = buf; 42410614SJonathan.Adams@Sun.COM 42510614SJonathan.Adams@Sun.COM buf += col->rc_size; 42610614SJonathan.Adams@Sun.COM } 42710614SJonathan.Adams@Sun.COM ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); 42810614SJonathan.Adams@Sun.COM } 42910614SJonathan.Adams@Sun.COM 43010614SJonathan.Adams@Sun.COM static const zio_vsd_ops_t vdev_raidz_vsd_ops = { 43110614SJonathan.Adams@Sun.COM vdev_raidz_map_free_vsd, 43210614SJonathan.Adams@Sun.COM vdev_raidz_cksum_report 43310614SJonathan.Adams@Sun.COM }; 43410614SJonathan.Adams@Sun.COM 435789Sahrens static raidz_map_t * 4362082Seschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 4372082Seschrock uint64_t nparity) 438789Sahrens { 439789Sahrens 
raidz_map_t *rm; 440789Sahrens uint64_t b = zio->io_offset >> unit_shift; 441789Sahrens uint64_t s = zio->io_size >> unit_shift; 442789Sahrens uint64_t f = b % dcols; 443789Sahrens uint64_t o = (b / dcols) << unit_shift; 44410105Sadam.leventhal@sun.com uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 445789Sahrens 4462082Seschrock q = s / (dcols - nparity); 4472082Seschrock r = s - q * (dcols - nparity); 4482082Seschrock bc = (r == 0 ? 0 : r + nparity); 44910105Sadam.leventhal@sun.com tot = s + nparity * (q + (r == 0 ? 0 : 1)); 450789Sahrens 45110105Sadam.leventhal@sun.com if (q == 0) { 45210105Sadam.leventhal@sun.com acols = bc; 45310105Sadam.leventhal@sun.com scols = MIN(dcols, roundup(bc, nparity + 1)); 45410105Sadam.leventhal@sun.com } else { 45510105Sadam.leventhal@sun.com acols = dcols; 45610105Sadam.leventhal@sun.com scols = dcols; 45710105Sadam.leventhal@sun.com } 458789Sahrens 45910105Sadam.leventhal@sun.com ASSERT3U(acols, <=, scols); 46010105Sadam.leventhal@sun.com 46110105Sadam.leventhal@sun.com rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 462789Sahrens 463789Sahrens rm->rm_cols = acols; 46410105Sadam.leventhal@sun.com rm->rm_scols = scols; 465789Sahrens rm->rm_bigcols = bc; 46610450Sadam.leventhal@sun.com rm->rm_skipstart = bc; 4672082Seschrock rm->rm_missingdata = 0; 4682082Seschrock rm->rm_missingparity = 0; 4692082Seschrock rm->rm_firstdatacol = nparity; 47010614SJonathan.Adams@Sun.COM rm->rm_datacopy = NULL; 47110614SJonathan.Adams@Sun.COM rm->rm_reports = 0; 47210614SJonathan.Adams@Sun.COM rm->rm_freed = 0; 47310614SJonathan.Adams@Sun.COM rm->rm_ecksuminjected = 0; 474789Sahrens 47510105Sadam.leventhal@sun.com asize = 0; 47610105Sadam.leventhal@sun.com 47710105Sadam.leventhal@sun.com for (c = 0; c < scols; c++) { 478789Sahrens col = f + c; 479789Sahrens coff = o; 480789Sahrens if (col >= dcols) { 481789Sahrens col -= dcols; 482789Sahrens coff += 1ULL << unit_shift; 483789Sahrens } 4842082Seschrock 
rm->rm_col[c].rc_devidx = col; 485789Sahrens rm->rm_col[c].rc_offset = coff; 486789Sahrens rm->rm_col[c].rc_data = NULL; 48710614SJonathan.Adams@Sun.COM rm->rm_col[c].rc_gdata = NULL; 488789Sahrens rm->rm_col[c].rc_error = 0; 489789Sahrens rm->rm_col[c].rc_tried = 0; 490789Sahrens rm->rm_col[c].rc_skipped = 0; 49110105Sadam.leventhal@sun.com 49210105Sadam.leventhal@sun.com if (c >= acols) 49310105Sadam.leventhal@sun.com rm->rm_col[c].rc_size = 0; 49410105Sadam.leventhal@sun.com else if (c < bc) 49510105Sadam.leventhal@sun.com rm->rm_col[c].rc_size = (q + 1) << unit_shift; 49610105Sadam.leventhal@sun.com else 49710105Sadam.leventhal@sun.com rm->rm_col[c].rc_size = q << unit_shift; 49810105Sadam.leventhal@sun.com 49910105Sadam.leventhal@sun.com asize += rm->rm_col[c].rc_size; 500789Sahrens } 501789Sahrens 50210105Sadam.leventhal@sun.com ASSERT3U(asize, ==, tot << unit_shift); 50310105Sadam.leventhal@sun.com rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 50410450Sadam.leventhal@sun.com rm->rm_nskip = roundup(tot, nparity + 1) - tot; 50510450Sadam.leventhal@sun.com ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 50610450Sadam.leventhal@sun.com ASSERT3U(rm->rm_nskip, <=, nparity); 507789Sahrens 508789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++) 509789Sahrens rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 510789Sahrens 511789Sahrens rm->rm_col[c].rc_data = zio->io_data; 512789Sahrens 513789Sahrens for (c = c + 1; c < acols; c++) 514789Sahrens rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 515789Sahrens rm->rm_col[c - 1].rc_size; 516789Sahrens 5171133Seschrock /* 5182082Seschrock * If all data stored spans all columns, there's a danger that parity 5192082Seschrock * will always be on the same device and, since parity isn't read 5202082Seschrock * during normal operation, that that device's I/O bandwidth won't be 5212082Seschrock * used effectively. We therefore switch the parity every 1MB. 
5222082Seschrock * 5232082Seschrock * ... at least that was, ostensibly, the theory. As a practical 5242082Seschrock * matter unless we juggle the parity between all devices evenly, we 5252082Seschrock * won't see any benefit. Further, occasional writes that aren't a 5262082Seschrock * multiple of the LCM of the number of children and the minimum 5272082Seschrock * stripe width are sufficient to avoid pessimal behavior. 5282082Seschrock * Unfortunately, this decision created an implicit on-disk format 5293456Sahl * requirement that we need to support for all eternity, but only 5303456Sahl * for single-parity RAID-Z. 53110450Sadam.leventhal@sun.com * 53210450Sadam.leventhal@sun.com * If we intend to skip a sector in the zeroth column for padding 53310450Sadam.leventhal@sun.com * we must make sure to note this swap. We will never intend to 53410450Sadam.leventhal@sun.com * skip the first column since at least one data and one parity 53510450Sadam.leventhal@sun.com * column must appear in each row. 
5361133Seschrock */ 5371133Seschrock ASSERT(rm->rm_cols >= 2); 5381133Seschrock ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 539789Sahrens 5402082Seschrock if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 5412082Seschrock devidx = rm->rm_col[0].rc_devidx; 5421133Seschrock o = rm->rm_col[0].rc_offset; 5432082Seschrock rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 5441133Seschrock rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 5452082Seschrock rm->rm_col[1].rc_devidx = devidx; 5461133Seschrock rm->rm_col[1].rc_offset = o; 54710450Sadam.leventhal@sun.com 54810450Sadam.leventhal@sun.com if (rm->rm_skipstart == 0) 54910450Sadam.leventhal@sun.com rm->rm_skipstart = 1; 550789Sahrens } 551789Sahrens 552789Sahrens zio->io_vsd = rm; 55310614SJonathan.Adams@Sun.COM zio->io_vsd_ops = &vdev_raidz_vsd_ops; 554789Sahrens return (rm); 555789Sahrens } 556789Sahrens 557789Sahrens static void 5582082Seschrock vdev_raidz_generate_parity_p(raidz_map_t *rm) 5592082Seschrock { 5602082Seschrock uint64_t *p, *src, pcount, ccount, i; 5612082Seschrock int c; 5622082Seschrock 5632082Seschrock pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 5642082Seschrock 5652082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 5662082Seschrock src = rm->rm_col[c].rc_data; 5672082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 5682082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 5692082Seschrock 5702082Seschrock if (c == rm->rm_firstdatacol) { 5712082Seschrock ASSERT(ccount == pcount); 57210105Sadam.leventhal@sun.com for (i = 0; i < ccount; i++, src++, p++) { 5732082Seschrock *p = *src; 5742082Seschrock } 5752082Seschrock } else { 5762082Seschrock ASSERT(ccount <= pcount); 57710105Sadam.leventhal@sun.com for (i = 0; i < ccount; i++, src++, p++) { 5782082Seschrock *p ^= *src; 5792082Seschrock } 5802082Seschrock } 5812082Seschrock } 5822082Seschrock } 5832082Seschrock 5842082Seschrock static void 5852082Seschrock 
vdev_raidz_generate_parity_pq(raidz_map_t *rm) 586789Sahrens { 58710105Sadam.leventhal@sun.com uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 5882082Seschrock int c; 5892082Seschrock 59010105Sadam.leventhal@sun.com pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 5912082Seschrock ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 5922082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 5932082Seschrock 5942082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 5952082Seschrock src = rm->rm_col[c].rc_data; 5962082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 5972082Seschrock q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 59810105Sadam.leventhal@sun.com 59910105Sadam.leventhal@sun.com ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 6002082Seschrock 6012082Seschrock if (c == rm->rm_firstdatacol) { 60210105Sadam.leventhal@sun.com ASSERT(ccnt == pcnt || ccnt == 0); 60310105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++) { 6042082Seschrock *p = *src; 60510105Sadam.leventhal@sun.com *q = *src; 6062082Seschrock } 60710105Sadam.leventhal@sun.com for (; i < pcnt; i++, src++, p++, q++) { 60810105Sadam.leventhal@sun.com *p = 0; 6092082Seschrock *q = 0; 6102082Seschrock } 6112082Seschrock } else { 61210105Sadam.leventhal@sun.com ASSERT(ccnt <= pcnt); 613789Sahrens 6142082Seschrock /* 61510105Sadam.leventhal@sun.com * Apply the algorithm described above by multiplying 61610105Sadam.leventhal@sun.com * the previous result and adding in the new value. 6172082Seschrock */ 61810105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++) { 61910105Sadam.leventhal@sun.com *p ^= *src; 62010105Sadam.leventhal@sun.com 62110105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask); 6222082Seschrock *q ^= *src; 6232082Seschrock } 6242082Seschrock 6252082Seschrock /* 6262082Seschrock * Treat short columns as though they are full of 0s. 62710105Sadam.leventhal@sun.com * Note that there's therefore nothing needed for P. 
6282082Seschrock */ 62910105Sadam.leventhal@sun.com for (; i < pcnt; i++, q++) { 63010105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask); 6312082Seschrock } 6322082Seschrock } 6332082Seschrock } 6342082Seschrock } 6352082Seschrock 6362082Seschrock static void 63710105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 63810105Sadam.leventhal@sun.com { 63910105Sadam.leventhal@sun.com uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 64010105Sadam.leventhal@sun.com int c; 64110105Sadam.leventhal@sun.com 64210105Sadam.leventhal@sun.com pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 64310105Sadam.leventhal@sun.com ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 64410105Sadam.leventhal@sun.com rm->rm_col[VDEV_RAIDZ_Q].rc_size); 64510105Sadam.leventhal@sun.com ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 64610105Sadam.leventhal@sun.com rm->rm_col[VDEV_RAIDZ_R].rc_size); 64710105Sadam.leventhal@sun.com 64810105Sadam.leventhal@sun.com for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 64910105Sadam.leventhal@sun.com src = rm->rm_col[c].rc_data; 65010105Sadam.leventhal@sun.com p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 65110105Sadam.leventhal@sun.com q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 65210105Sadam.leventhal@sun.com r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 65310105Sadam.leventhal@sun.com 65410105Sadam.leventhal@sun.com ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 65510105Sadam.leventhal@sun.com 65610105Sadam.leventhal@sun.com if (c == rm->rm_firstdatacol) { 65710105Sadam.leventhal@sun.com ASSERT(ccnt == pcnt || ccnt == 0); 65810105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 65910105Sadam.leventhal@sun.com *p = *src; 66010105Sadam.leventhal@sun.com *q = *src; 66110105Sadam.leventhal@sun.com *r = *src; 66210105Sadam.leventhal@sun.com } 66310105Sadam.leventhal@sun.com for (; i < pcnt; i++, src++, p++, q++, r++) { 66410105Sadam.leventhal@sun.com *p = 0; 66510105Sadam.leventhal@sun.com *q = 0; 
66610105Sadam.leventhal@sun.com *r = 0; 66710105Sadam.leventhal@sun.com } 66810105Sadam.leventhal@sun.com } else { 66910105Sadam.leventhal@sun.com ASSERT(ccnt <= pcnt); 67010105Sadam.leventhal@sun.com 67110105Sadam.leventhal@sun.com /* 67210105Sadam.leventhal@sun.com * Apply the algorithm described above by multiplying 67310105Sadam.leventhal@sun.com * the previous result and adding in the new value. 67410105Sadam.leventhal@sun.com */ 67510105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 67610105Sadam.leventhal@sun.com *p ^= *src; 67710105Sadam.leventhal@sun.com 67810105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask); 67910105Sadam.leventhal@sun.com *q ^= *src; 68010105Sadam.leventhal@sun.com 68110105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_4(*r, mask); 68210105Sadam.leventhal@sun.com *r ^= *src; 68310105Sadam.leventhal@sun.com } 68410105Sadam.leventhal@sun.com 68510105Sadam.leventhal@sun.com /* 68610105Sadam.leventhal@sun.com * Treat short columns as though they are full of 0s. 68710105Sadam.leventhal@sun.com * Note that there's therefore nothing needed for P. 68810105Sadam.leventhal@sun.com */ 68910105Sadam.leventhal@sun.com for (; i < pcnt; i++, q++, r++) { 69010105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask); 69110105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_4(*r, mask); 69210105Sadam.leventhal@sun.com } 69310105Sadam.leventhal@sun.com } 69410105Sadam.leventhal@sun.com } 69510105Sadam.leventhal@sun.com } 69610105Sadam.leventhal@sun.com 69710105Sadam.leventhal@sun.com /* 69810105Sadam.leventhal@sun.com * Generate RAID parity in the first virtual columns according to the number of 69910105Sadam.leventhal@sun.com * parity columns available. 
70010105Sadam.leventhal@sun.com */ 70110105Sadam.leventhal@sun.com static void 70210105Sadam.leventhal@sun.com vdev_raidz_generate_parity(raidz_map_t *rm) 70310105Sadam.leventhal@sun.com { 70410105Sadam.leventhal@sun.com switch (rm->rm_firstdatacol) { 70510105Sadam.leventhal@sun.com case 1: 70610105Sadam.leventhal@sun.com vdev_raidz_generate_parity_p(rm); 70710105Sadam.leventhal@sun.com break; 70810105Sadam.leventhal@sun.com case 2: 70910105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pq(rm); 71010105Sadam.leventhal@sun.com break; 71110105Sadam.leventhal@sun.com case 3: 71210105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pqr(rm); 71310105Sadam.leventhal@sun.com break; 71410105Sadam.leventhal@sun.com default: 71510105Sadam.leventhal@sun.com cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 71610105Sadam.leventhal@sun.com } 71710105Sadam.leventhal@sun.com } 71810105Sadam.leventhal@sun.com 71910105Sadam.leventhal@sun.com static int 72010105Sadam.leventhal@sun.com vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 7212082Seschrock { 7222082Seschrock uint64_t *dst, *src, xcount, ccount, count, i; 72310105Sadam.leventhal@sun.com int x = tgts[0]; 7242082Seschrock int c; 7252082Seschrock 72610105Sadam.leventhal@sun.com ASSERT(ntgts == 1); 72710105Sadam.leventhal@sun.com ASSERT(x >= rm->rm_firstdatacol); 72810105Sadam.leventhal@sun.com ASSERT(x < rm->rm_cols); 72910105Sadam.leventhal@sun.com 7302082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 7312082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 7322082Seschrock ASSERT(xcount > 0); 7332082Seschrock 7342082Seschrock src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 7352082Seschrock dst = rm->rm_col[x].rc_data; 7362082Seschrock for (i = 0; i < xcount; i++, dst++, src++) { 7372082Seschrock *dst = *src; 7382082Seschrock } 7392082Seschrock 7402082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 741789Sahrens src = rm->rm_col[c].rc_data; 742789Sahrens 
dst = rm->rm_col[x].rc_data; 7432082Seschrock 7442082Seschrock if (c == x) 7452082Seschrock continue; 7462082Seschrock 7472082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 7482082Seschrock count = MIN(ccount, xcount); 7492082Seschrock 7502082Seschrock for (i = 0; i < count; i++, dst++, src++) { 7512082Seschrock *dst ^= *src; 752789Sahrens } 753789Sahrens } 75410105Sadam.leventhal@sun.com 75510105Sadam.leventhal@sun.com return (1 << VDEV_RAIDZ_P); 756789Sahrens } 757789Sahrens 75810105Sadam.leventhal@sun.com static int 75910105Sadam.leventhal@sun.com vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 7602082Seschrock { 7612082Seschrock uint64_t *dst, *src, xcount, ccount, count, mask, i; 7622082Seschrock uint8_t *b; 76310105Sadam.leventhal@sun.com int x = tgts[0]; 7642082Seschrock int c, j, exp; 7652082Seschrock 76610105Sadam.leventhal@sun.com ASSERT(ntgts == 1); 76710105Sadam.leventhal@sun.com 7682082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 7692082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 7702082Seschrock 7712082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 7722082Seschrock src = rm->rm_col[c].rc_data; 7732082Seschrock dst = rm->rm_col[x].rc_data; 7742082Seschrock 7752082Seschrock if (c == x) 7762082Seschrock ccount = 0; 7772082Seschrock else 7782082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 7792082Seschrock 7802082Seschrock count = MIN(ccount, xcount); 7812082Seschrock 7822082Seschrock if (c == rm->rm_firstdatacol) { 7832082Seschrock for (i = 0; i < count; i++, dst++, src++) { 7842082Seschrock *dst = *src; 7852082Seschrock } 7862082Seschrock for (; i < xcount; i++, dst++) { 7872082Seschrock *dst = 0; 7882082Seschrock } 7892082Seschrock 7902082Seschrock } else { 7912082Seschrock for (i = 0; i < count; i++, dst++, src++) { 79210105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*dst, mask); 7932082Seschrock *dst ^= *src; 7942082Seschrock } 
7952082Seschrock 7962082Seschrock for (; i < xcount; i++, dst++) { 79710105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*dst, mask); 7982082Seschrock } 7992082Seschrock } 8002082Seschrock } 8012082Seschrock 8022082Seschrock src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 8032082Seschrock dst = rm->rm_col[x].rc_data; 8042082Seschrock exp = 255 - (rm->rm_cols - 1 - x); 8052082Seschrock 8062082Seschrock for (i = 0; i < xcount; i++, dst++, src++) { 8072082Seschrock *dst ^= *src; 8082082Seschrock for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 8092082Seschrock *b = vdev_raidz_exp2(*b, exp); 8102082Seschrock } 8112082Seschrock } 81210105Sadam.leventhal@sun.com 81310105Sadam.leventhal@sun.com return (1 << VDEV_RAIDZ_Q); 8142082Seschrock } 8152082Seschrock 81610105Sadam.leventhal@sun.com static int 81710105Sadam.leventhal@sun.com vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 8182082Seschrock { 8192082Seschrock uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 8202082Seschrock void *pdata, *qdata; 8212082Seschrock uint64_t xsize, ysize, i; 82210105Sadam.leventhal@sun.com int x = tgts[0]; 82310105Sadam.leventhal@sun.com int y = tgts[1]; 8242082Seschrock 82510105Sadam.leventhal@sun.com ASSERT(ntgts == 2); 8262082Seschrock ASSERT(x < y); 8272082Seschrock ASSERT(x >= rm->rm_firstdatacol); 8282082Seschrock ASSERT(y < rm->rm_cols); 8292082Seschrock 8302082Seschrock ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 8312082Seschrock 8322082Seschrock /* 8332082Seschrock * Move the parity data aside -- we're going to compute parity as 8342082Seschrock * though columns x and y were full of zeros -- Pxy and Qxy. We want to 8352082Seschrock * reuse the parity generation mechanism without trashing the actual 8362082Seschrock * parity so we make those columns appear to be full of zeros by 8372082Seschrock * setting their lengths to zero. 
8382082Seschrock */ 8392082Seschrock pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 8402082Seschrock qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 8412082Seschrock xsize = rm->rm_col[x].rc_size; 8422082Seschrock ysize = rm->rm_col[y].rc_size; 8432082Seschrock 8442082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = 8452082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 8462082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = 8472082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 8482082Seschrock rm->rm_col[x].rc_size = 0; 8492082Seschrock rm->rm_col[y].rc_size = 0; 8502082Seschrock 8512082Seschrock vdev_raidz_generate_parity_pq(rm); 8522082Seschrock 8532082Seschrock rm->rm_col[x].rc_size = xsize; 8542082Seschrock rm->rm_col[y].rc_size = ysize; 8552082Seschrock 8562082Seschrock p = pdata; 8572082Seschrock q = qdata; 8582082Seschrock pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 8592082Seschrock qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 8602082Seschrock xd = rm->rm_col[x].rc_data; 8612082Seschrock yd = rm->rm_col[y].rc_data; 8622082Seschrock 8632082Seschrock /* 8642082Seschrock * We now have: 8652082Seschrock * Pxy = P + D_x + D_y 8662082Seschrock * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 8672082Seschrock * 8682082Seschrock * We can then solve for D_x: 8692082Seschrock * D_x = A * (P + Pxy) + B * (Q + Qxy) 8702082Seschrock * where 8712082Seschrock * A = 2^(x - y) * (2^(x - y) + 1)^-1 8722082Seschrock * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 8732082Seschrock * 8742082Seschrock * With D_x in hand, we can easily solve for D_y: 8752082Seschrock * D_y = P + Pxy + D_x 8762082Seschrock */ 8772082Seschrock 8782082Seschrock a = vdev_raidz_pow2[255 + x - y]; 8792082Seschrock b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 8802082Seschrock tmp = 255 - vdev_raidz_log2[a ^ 1]; 8812082Seschrock 8822082Seschrock aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 8832082Seschrock bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 8842082Seschrock 8852082Seschrock 
for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 8862082Seschrock *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 8872082Seschrock vdev_raidz_exp2(*q ^ *qxy, bexp); 8882082Seschrock 8892082Seschrock if (i < ysize) 8902082Seschrock *yd = *p ^ *pxy ^ *xd; 8912082Seschrock } 8922082Seschrock 8932082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 8942082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_size); 8952082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 8962082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 8972082Seschrock 8982082Seschrock /* 8992082Seschrock * Restore the saved parity data. 9002082Seschrock */ 9012082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 9022082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 90310105Sadam.leventhal@sun.com 90410105Sadam.leventhal@sun.com return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 90510105Sadam.leventhal@sun.com } 90610105Sadam.leventhal@sun.com 90710105Sadam.leventhal@sun.com /* BEGIN CSTYLED */ 90810105Sadam.leventhal@sun.com /* 90910105Sadam.leventhal@sun.com * In the general case of reconstruction, we must solve the system of linear 91010105Sadam.leventhal@sun.com * equations defined by the coeffecients used to generate parity as well as 91110105Sadam.leventhal@sun.com * the contents of the data and parity disks. 
 * This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 * computation as well as linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known.
To compute D, we must invert the 94110105Sadam.leventhal@sun.com * matrix and use the known data and parity values to reconstruct the unknown 94210105Sadam.leventhal@sun.com * data values. We begin by removing the rows in V|I and d|p that correspond 94310105Sadam.leventhal@sun.com * to failed or missing columns; we then make V|I square (n x n) and d|p 94410105Sadam.leventhal@sun.com * sized n by removing rows corresponding to unused parity from the bottom up 94510105Sadam.leventhal@sun.com * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 94610105Sadam.leventhal@sun.com * using Gauss-Jordan elimination. In the example below we use m=3 parity 94710105Sadam.leventhal@sun.com * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 94810105Sadam.leventhal@sun.com * __ __ 94910105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 | 95010105Sadam.leventhal@sun.com * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 95110105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 | / / 95210105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 | / / 95310105Sadam.leventhal@sun.com * | 0 1 0 0 0 0 0 0 | <--' / 95410105Sadam.leventhal@sun.com * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 95510105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 | 95610105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 | 95710105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 | 95810105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 | 95910105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 | 96010105Sadam.leventhal@sun.com * ~~ ~~ 96110105Sadam.leventhal@sun.com * __ __ 96210105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 | 96310105Sadam.leventhal@sun.com * | 128 64 32 16 8 4 2 1 | 96410105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 | 96510105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 | 96610105Sadam.leventhal@sun.com * | 0 1 0 0 0 0 0 0 | 96710105Sadam.leventhal@sun.com * (V|I)' = | 0 0 1 0 0 0 0 0 | 96810105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 | 96910105Sadam.leventhal@sun.com * | 0 0 0 0 1 
0 0 0 | 97010105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 | 97110105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 | 97210105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 | 97310105Sadam.leventhal@sun.com * ~~ ~~ 97410105Sadam.leventhal@sun.com * 97510105Sadam.leventhal@sun.com * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 97610105Sadam.leventhal@sun.com * have carefully chosen the seed values 1, 2, and 4 to ensure that this 97710105Sadam.leventhal@sun.com * matrix is not singular. 97810105Sadam.leventhal@sun.com * __ __ 97910105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 98010105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 98110105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 98210105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 98310105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 98410105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 98510105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 98610105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 98710105Sadam.leventhal@sun.com * ~~ ~~ 98810105Sadam.leventhal@sun.com * __ __ 98910105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 99010105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 99110105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 99210105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 99310105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 99410105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 99510105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 99610105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 99710105Sadam.leventhal@sun.com * ~~ ~~ 99810105Sadam.leventhal@sun.com * __ __ 99910105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 100010105Sadam.leventhal@sun.com * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 
100110105Sadam.leventhal@sun.com * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 100210105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 100310105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 100410105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 100510105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 100610105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 100710105Sadam.leventhal@sun.com * ~~ ~~ 100810105Sadam.leventhal@sun.com * __ __ 100910105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 101010105Sadam.leventhal@sun.com * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 101110105Sadam.leventhal@sun.com * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 101210105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 101310105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 101410105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 101510105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 101610105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 101710105Sadam.leventhal@sun.com * ~~ ~~ 101810105Sadam.leventhal@sun.com * __ __ 101910105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 102010105Sadam.leventhal@sun.com * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 102110105Sadam.leventhal@sun.com * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 102210105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 102310105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 102410105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 102510105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 102610105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 102710105Sadam.leventhal@sun.com * ~~ ~~ 102810105Sadam.leventhal@sun.com * __ __ 102910105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 103010105Sadam.leventhal@sun.com * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 
103110105Sadam.leventhal@sun.com * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 103210105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 103310105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 103410105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 103510105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 103610105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 103710105Sadam.leventhal@sun.com * ~~ ~~ 103810105Sadam.leventhal@sun.com * __ __ 103910105Sadam.leventhal@sun.com * | 0 0 1 0 0 0 0 0 | 104010105Sadam.leventhal@sun.com * | 167 100 5 41 159 169 217 208 | 104110105Sadam.leventhal@sun.com * | 166 100 4 40 158 168 216 209 | 104210105Sadam.leventhal@sun.com * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 104310105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 | 104410105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 | 104510105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 | 104610105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 | 104710105Sadam.leventhal@sun.com * ~~ ~~ 104810105Sadam.leventhal@sun.com * 104910105Sadam.leventhal@sun.com * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 105010105Sadam.leventhal@sun.com * of the missing data. 105110105Sadam.leventhal@sun.com * 105210105Sadam.leventhal@sun.com * As is apparent from the example above, the only non-trivial rows in the 105310105Sadam.leventhal@sun.com * inverse matrix correspond to the data disks that we're trying to 105410105Sadam.leventhal@sun.com * reconstruct. Indeed, those are the only rows we need as the others would 105510105Sadam.leventhal@sun.com * only be useful for reconstructing data known or assumed to be valid. For 105610105Sadam.leventhal@sun.com * that reason, we only build the coefficients in the rows that correspond to 105710105Sadam.leventhal@sun.com * targeted columns. 
105810105Sadam.leventhal@sun.com */ 105910105Sadam.leventhal@sun.com /* END CSTYLED */ 106010105Sadam.leventhal@sun.com 106110105Sadam.leventhal@sun.com static void 106210105Sadam.leventhal@sun.com vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 106310105Sadam.leventhal@sun.com uint8_t **rows) 106410105Sadam.leventhal@sun.com { 106510105Sadam.leventhal@sun.com int i, j; 106610105Sadam.leventhal@sun.com int pow; 106710105Sadam.leventhal@sun.com 106810105Sadam.leventhal@sun.com ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 106910105Sadam.leventhal@sun.com 107010105Sadam.leventhal@sun.com /* 107110105Sadam.leventhal@sun.com * Fill in the missing rows of interest. 107210105Sadam.leventhal@sun.com */ 107310105Sadam.leventhal@sun.com for (i = 0; i < nmap; i++) { 107410105Sadam.leventhal@sun.com ASSERT3S(0, <=, map[i]); 107510105Sadam.leventhal@sun.com ASSERT3S(map[i], <=, 2); 107610105Sadam.leventhal@sun.com 107710105Sadam.leventhal@sun.com pow = map[i] * n; 107810105Sadam.leventhal@sun.com if (pow > 255) 107910105Sadam.leventhal@sun.com pow -= 255; 108010105Sadam.leventhal@sun.com ASSERT(pow <= 255); 108110105Sadam.leventhal@sun.com 108210105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 108310105Sadam.leventhal@sun.com pow -= map[i]; 108410105Sadam.leventhal@sun.com if (pow < 0) 108510105Sadam.leventhal@sun.com pow += 255; 108610105Sadam.leventhal@sun.com rows[i][j] = vdev_raidz_pow2[pow]; 108710105Sadam.leventhal@sun.com } 108810105Sadam.leventhal@sun.com } 10892082Seschrock } 10902082Seschrock 109110105Sadam.leventhal@sun.com static void 109210105Sadam.leventhal@sun.com vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 109310105Sadam.leventhal@sun.com uint8_t **rows, uint8_t **invrows, const uint8_t *used) 109410105Sadam.leventhal@sun.com { 109510105Sadam.leventhal@sun.com int i, j, ii, jj; 109610105Sadam.leventhal@sun.com uint8_t log; 109710105Sadam.leventhal@sun.com 109810105Sadam.leventhal@sun.com /* 
109910105Sadam.leventhal@sun.com * Assert that the first nmissing entries from the array of used 110010105Sadam.leventhal@sun.com * columns correspond to parity columns and that subsequent entries 110110105Sadam.leventhal@sun.com * correspond to data columns. 110210105Sadam.leventhal@sun.com */ 110310105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 110410105Sadam.leventhal@sun.com ASSERT3S(used[i], <, rm->rm_firstdatacol); 110510105Sadam.leventhal@sun.com } 110610105Sadam.leventhal@sun.com for (; i < n; i++) { 110710105Sadam.leventhal@sun.com ASSERT3S(used[i], >=, rm->rm_firstdatacol); 110810105Sadam.leventhal@sun.com } 110910105Sadam.leventhal@sun.com 111010105Sadam.leventhal@sun.com /* 111110105Sadam.leventhal@sun.com * First initialize the storage where we'll compute the inverse rows. 111210105Sadam.leventhal@sun.com */ 111310105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 111410105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 111510105Sadam.leventhal@sun.com invrows[i][j] = (i == j) ? 1 : 0; 111610105Sadam.leventhal@sun.com } 111710105Sadam.leventhal@sun.com } 111810105Sadam.leventhal@sun.com 111910105Sadam.leventhal@sun.com /* 112010105Sadam.leventhal@sun.com * Subtract all trivial rows from the rows of consequence. 
112110105Sadam.leventhal@sun.com */ 112210105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 112310105Sadam.leventhal@sun.com for (j = nmissing; j < n; j++) { 112410105Sadam.leventhal@sun.com ASSERT3U(used[j], >=, rm->rm_firstdatacol); 112510105Sadam.leventhal@sun.com jj = used[j] - rm->rm_firstdatacol; 112610105Sadam.leventhal@sun.com ASSERT3S(jj, <, n); 112710105Sadam.leventhal@sun.com invrows[i][j] = rows[i][jj]; 112810105Sadam.leventhal@sun.com rows[i][jj] = 0; 112910105Sadam.leventhal@sun.com } 113010105Sadam.leventhal@sun.com } 113110105Sadam.leventhal@sun.com 113210105Sadam.leventhal@sun.com /* 113310105Sadam.leventhal@sun.com * For each of the rows of interest, we must normalize it and subtract 113410105Sadam.leventhal@sun.com * a multiple of it from the other rows. 113510105Sadam.leventhal@sun.com */ 113610105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 113710105Sadam.leventhal@sun.com for (j = 0; j < missing[i]; j++) { 113810105Sadam.leventhal@sun.com ASSERT3U(rows[i][j], ==, 0); 113910105Sadam.leventhal@sun.com } 114010105Sadam.leventhal@sun.com ASSERT3U(rows[i][missing[i]], !=, 0); 114110105Sadam.leventhal@sun.com 114210105Sadam.leventhal@sun.com /* 114310105Sadam.leventhal@sun.com * Compute the inverse of the first element and multiply each 114410105Sadam.leventhal@sun.com * element in the row by that value. 
114510105Sadam.leventhal@sun.com */ 114610105Sadam.leventhal@sun.com log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 114710105Sadam.leventhal@sun.com 114810105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 114910105Sadam.leventhal@sun.com rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 115010105Sadam.leventhal@sun.com invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 115110105Sadam.leventhal@sun.com } 115210105Sadam.leventhal@sun.com 115310105Sadam.leventhal@sun.com for (ii = 0; ii < nmissing; ii++) { 115410105Sadam.leventhal@sun.com if (i == ii) 115510105Sadam.leventhal@sun.com continue; 115610105Sadam.leventhal@sun.com 115710105Sadam.leventhal@sun.com ASSERT3U(rows[ii][missing[i]], !=, 0); 115810105Sadam.leventhal@sun.com 115910105Sadam.leventhal@sun.com log = vdev_raidz_log2[rows[ii][missing[i]]]; 116010105Sadam.leventhal@sun.com 116110105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 116210105Sadam.leventhal@sun.com rows[ii][j] ^= 116310105Sadam.leventhal@sun.com vdev_raidz_exp2(rows[i][j], log); 116410105Sadam.leventhal@sun.com invrows[ii][j] ^= 116510105Sadam.leventhal@sun.com vdev_raidz_exp2(invrows[i][j], log); 116610105Sadam.leventhal@sun.com } 116710105Sadam.leventhal@sun.com } 116810105Sadam.leventhal@sun.com } 116910105Sadam.leventhal@sun.com 117010105Sadam.leventhal@sun.com /* 117110105Sadam.leventhal@sun.com * Verify that the data that is left in the rows are properly part of 117210105Sadam.leventhal@sun.com * an identity matrix. 
117310105Sadam.leventhal@sun.com */ 117410105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 117510105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 117610105Sadam.leventhal@sun.com if (j == missing[i]) { 117710105Sadam.leventhal@sun.com ASSERT3U(rows[i][j], ==, 1); 117810105Sadam.leventhal@sun.com } else { 117910105Sadam.leventhal@sun.com ASSERT3U(rows[i][j], ==, 0); 118010105Sadam.leventhal@sun.com } 118110105Sadam.leventhal@sun.com } 118210105Sadam.leventhal@sun.com } 118310105Sadam.leventhal@sun.com } 118410105Sadam.leventhal@sun.com 118510105Sadam.leventhal@sun.com static void 118610105Sadam.leventhal@sun.com vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 118710105Sadam.leventhal@sun.com int *missing, uint8_t **invrows, const uint8_t *used) 118810105Sadam.leventhal@sun.com { 118910105Sadam.leventhal@sun.com int i, j, x, cc, c; 119010105Sadam.leventhal@sun.com uint8_t *src; 119110105Sadam.leventhal@sun.com uint64_t ccount; 119210105Sadam.leventhal@sun.com uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 119310105Sadam.leventhal@sun.com uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 119410105Sadam.leventhal@sun.com uint8_t log, val; 119510105Sadam.leventhal@sun.com int ll; 119610105Sadam.leventhal@sun.com uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 119710105Sadam.leventhal@sun.com uint8_t *p, *pp; 119810105Sadam.leventhal@sun.com size_t psize; 119910105Sadam.leventhal@sun.com 120010105Sadam.leventhal@sun.com psize = sizeof (invlog[0][0]) * n * nmissing; 120110105Sadam.leventhal@sun.com p = kmem_alloc(psize, KM_SLEEP); 120210105Sadam.leventhal@sun.com 120310105Sadam.leventhal@sun.com for (pp = p, i = 0; i < nmissing; i++) { 120410105Sadam.leventhal@sun.com invlog[i] = pp; 120510105Sadam.leventhal@sun.com pp += n; 120610105Sadam.leventhal@sun.com } 120710105Sadam.leventhal@sun.com 120810105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 120910105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 121010105Sadam.leventhal@sun.com 
ASSERT3U(invrows[i][j], !=, 0); 121110105Sadam.leventhal@sun.com invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 121210105Sadam.leventhal@sun.com } 121310105Sadam.leventhal@sun.com } 121410105Sadam.leventhal@sun.com 121510105Sadam.leventhal@sun.com for (i = 0; i < n; i++) { 121610105Sadam.leventhal@sun.com c = used[i]; 121710105Sadam.leventhal@sun.com ASSERT3U(c, <, rm->rm_cols); 121810105Sadam.leventhal@sun.com 121910105Sadam.leventhal@sun.com src = rm->rm_col[c].rc_data; 122010105Sadam.leventhal@sun.com ccount = rm->rm_col[c].rc_size; 122110105Sadam.leventhal@sun.com for (j = 0; j < nmissing; j++) { 122210105Sadam.leventhal@sun.com cc = missing[j] + rm->rm_firstdatacol; 122310105Sadam.leventhal@sun.com ASSERT3U(cc, >=, rm->rm_firstdatacol); 122410105Sadam.leventhal@sun.com ASSERT3U(cc, <, rm->rm_cols); 122510105Sadam.leventhal@sun.com ASSERT3U(cc, !=, c); 122610105Sadam.leventhal@sun.com 122710105Sadam.leventhal@sun.com dst[j] = rm->rm_col[cc].rc_data; 122810105Sadam.leventhal@sun.com dcount[j] = rm->rm_col[cc].rc_size; 122910105Sadam.leventhal@sun.com } 123010105Sadam.leventhal@sun.com 123110105Sadam.leventhal@sun.com ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 123210105Sadam.leventhal@sun.com 123310105Sadam.leventhal@sun.com for (x = 0; x < ccount; x++, src++) { 123410105Sadam.leventhal@sun.com if (*src != 0) 123510105Sadam.leventhal@sun.com log = vdev_raidz_log2[*src]; 123610105Sadam.leventhal@sun.com 123710105Sadam.leventhal@sun.com for (cc = 0; cc < nmissing; cc++) { 123810105Sadam.leventhal@sun.com if (x >= dcount[cc]) 123910105Sadam.leventhal@sun.com continue; 124010105Sadam.leventhal@sun.com 124110105Sadam.leventhal@sun.com if (*src == 0) { 124210105Sadam.leventhal@sun.com val = 0; 124310105Sadam.leventhal@sun.com } else { 124410105Sadam.leventhal@sun.com if ((ll = log + invlog[cc][i]) >= 255) 124510105Sadam.leventhal@sun.com ll -= 255; 124610105Sadam.leventhal@sun.com val = vdev_raidz_pow2[ll]; 124710105Sadam.leventhal@sun.com } 
124810105Sadam.leventhal@sun.com 124910105Sadam.leventhal@sun.com if (i == 0) 125010105Sadam.leventhal@sun.com dst[cc][x] = val; 125110105Sadam.leventhal@sun.com else 125210105Sadam.leventhal@sun.com dst[cc][x] ^= val; 125310105Sadam.leventhal@sun.com } 125410105Sadam.leventhal@sun.com } 125510105Sadam.leventhal@sun.com } 125610105Sadam.leventhal@sun.com 125710105Sadam.leventhal@sun.com kmem_free(p, psize); 125810105Sadam.leventhal@sun.com } 125910105Sadam.leventhal@sun.com 126010105Sadam.leventhal@sun.com static int 126110105Sadam.leventhal@sun.com vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 126210105Sadam.leventhal@sun.com { 126310105Sadam.leventhal@sun.com int n, i, c, t, tt; 126410105Sadam.leventhal@sun.com int nmissing_rows; 126510105Sadam.leventhal@sun.com int missing_rows[VDEV_RAIDZ_MAXPARITY]; 126610105Sadam.leventhal@sun.com int parity_map[VDEV_RAIDZ_MAXPARITY]; 126710105Sadam.leventhal@sun.com 126810105Sadam.leventhal@sun.com uint8_t *p, *pp; 126910105Sadam.leventhal@sun.com size_t psize; 127010105Sadam.leventhal@sun.com 127110105Sadam.leventhal@sun.com uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 127210105Sadam.leventhal@sun.com uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 127310105Sadam.leventhal@sun.com uint8_t *used; 127410105Sadam.leventhal@sun.com 127510105Sadam.leventhal@sun.com int code = 0; 127610105Sadam.leventhal@sun.com 127710105Sadam.leventhal@sun.com 127810105Sadam.leventhal@sun.com n = rm->rm_cols - rm->rm_firstdatacol; 127910105Sadam.leventhal@sun.com 128010105Sadam.leventhal@sun.com /* 128110105Sadam.leventhal@sun.com * Figure out which data columns are missing. 
128210105Sadam.leventhal@sun.com */ 128310105Sadam.leventhal@sun.com nmissing_rows = 0; 128410105Sadam.leventhal@sun.com for (t = 0; t < ntgts; t++) { 128510105Sadam.leventhal@sun.com if (tgts[t] >= rm->rm_firstdatacol) { 128610105Sadam.leventhal@sun.com missing_rows[nmissing_rows++] = 128710105Sadam.leventhal@sun.com tgts[t] - rm->rm_firstdatacol; 128810105Sadam.leventhal@sun.com } 128910105Sadam.leventhal@sun.com } 129010105Sadam.leventhal@sun.com 129110105Sadam.leventhal@sun.com /* 129210105Sadam.leventhal@sun.com * Figure out which parity columns to use to help generate the missing 129310105Sadam.leventhal@sun.com * data columns. 129410105Sadam.leventhal@sun.com */ 129510105Sadam.leventhal@sun.com for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 129610105Sadam.leventhal@sun.com ASSERT(tt < ntgts); 129710105Sadam.leventhal@sun.com ASSERT(c < rm->rm_firstdatacol); 129810105Sadam.leventhal@sun.com 129910105Sadam.leventhal@sun.com /* 130010105Sadam.leventhal@sun.com * Skip any targeted parity columns. 
130110105Sadam.leventhal@sun.com */ 130210105Sadam.leventhal@sun.com if (c == tgts[tt]) { 130310105Sadam.leventhal@sun.com tt++; 130410105Sadam.leventhal@sun.com continue; 130510105Sadam.leventhal@sun.com } 130610105Sadam.leventhal@sun.com 130710105Sadam.leventhal@sun.com code |= 1 << c; 130810105Sadam.leventhal@sun.com 130910105Sadam.leventhal@sun.com parity_map[i] = c; 131010105Sadam.leventhal@sun.com i++; 131110105Sadam.leventhal@sun.com } 131210105Sadam.leventhal@sun.com 131310105Sadam.leventhal@sun.com ASSERT(code != 0); 131410105Sadam.leventhal@sun.com ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 131510105Sadam.leventhal@sun.com 131610105Sadam.leventhal@sun.com psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 131710105Sadam.leventhal@sun.com nmissing_rows * n + sizeof (used[0]) * n; 131810105Sadam.leventhal@sun.com p = kmem_alloc(psize, KM_SLEEP); 131910105Sadam.leventhal@sun.com 132010105Sadam.leventhal@sun.com for (pp = p, i = 0; i < nmissing_rows; i++) { 132110105Sadam.leventhal@sun.com rows[i] = pp; 132210105Sadam.leventhal@sun.com pp += n; 132310105Sadam.leventhal@sun.com invrows[i] = pp; 132410105Sadam.leventhal@sun.com pp += n; 132510105Sadam.leventhal@sun.com } 132610105Sadam.leventhal@sun.com used = pp; 132710105Sadam.leventhal@sun.com 132810105Sadam.leventhal@sun.com for (i = 0; i < nmissing_rows; i++) { 132910105Sadam.leventhal@sun.com used[i] = parity_map[i]; 133010105Sadam.leventhal@sun.com } 133110105Sadam.leventhal@sun.com 133210105Sadam.leventhal@sun.com for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 133310105Sadam.leventhal@sun.com if (tt < nmissing_rows && 133410105Sadam.leventhal@sun.com c == missing_rows[tt] + rm->rm_firstdatacol) { 133510105Sadam.leventhal@sun.com tt++; 133610105Sadam.leventhal@sun.com continue; 133710105Sadam.leventhal@sun.com } 133810105Sadam.leventhal@sun.com 133910105Sadam.leventhal@sun.com ASSERT3S(i, <, n); 134010105Sadam.leventhal@sun.com used[i] = c; 134110105Sadam.leventhal@sun.com i++; 
134210105Sadam.leventhal@sun.com } 134310105Sadam.leventhal@sun.com 134410105Sadam.leventhal@sun.com /* 134510105Sadam.leventhal@sun.com * Initialize the interesting rows of the matrix. 134610105Sadam.leventhal@sun.com */ 134710105Sadam.leventhal@sun.com vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 134810105Sadam.leventhal@sun.com 134910105Sadam.leventhal@sun.com /* 135010105Sadam.leventhal@sun.com * Invert the matrix. 135110105Sadam.leventhal@sun.com */ 135210105Sadam.leventhal@sun.com vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 135310105Sadam.leventhal@sun.com invrows, used); 135410105Sadam.leventhal@sun.com 135510105Sadam.leventhal@sun.com /* 135610105Sadam.leventhal@sun.com * Reconstruct the missing data using the generated matrix. 135710105Sadam.leventhal@sun.com */ 135810105Sadam.leventhal@sun.com vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 135910105Sadam.leventhal@sun.com invrows, used); 136010105Sadam.leventhal@sun.com 136110105Sadam.leventhal@sun.com kmem_free(p, psize); 136210105Sadam.leventhal@sun.com 136310105Sadam.leventhal@sun.com return (code); 136410105Sadam.leventhal@sun.com } 136510105Sadam.leventhal@sun.com 136610105Sadam.leventhal@sun.com static int 136710105Sadam.leventhal@sun.com vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 136810105Sadam.leventhal@sun.com { 136910105Sadam.leventhal@sun.com int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 137010105Sadam.leventhal@sun.com int ntgts; 137110105Sadam.leventhal@sun.com int i, c; 137210105Sadam.leventhal@sun.com int code; 137310105Sadam.leventhal@sun.com int nbadparity, nbaddata; 137410105Sadam.leventhal@sun.com int parity_valid[VDEV_RAIDZ_MAXPARITY]; 137510105Sadam.leventhal@sun.com 137610105Sadam.leventhal@sun.com /* 137710105Sadam.leventhal@sun.com * The tgts list must already be sorted. 
137810105Sadam.leventhal@sun.com */ 137910105Sadam.leventhal@sun.com for (i = 1; i < nt; i++) { 138010105Sadam.leventhal@sun.com ASSERT(t[i] > t[i - 1]); 138110105Sadam.leventhal@sun.com } 138210105Sadam.leventhal@sun.com 138310105Sadam.leventhal@sun.com nbadparity = rm->rm_firstdatacol; 138410105Sadam.leventhal@sun.com nbaddata = rm->rm_cols - nbadparity; 138510105Sadam.leventhal@sun.com ntgts = 0; 138610105Sadam.leventhal@sun.com for (i = 0, c = 0; c < rm->rm_cols; c++) { 138710105Sadam.leventhal@sun.com if (c < rm->rm_firstdatacol) 138810105Sadam.leventhal@sun.com parity_valid[c] = B_FALSE; 138910105Sadam.leventhal@sun.com 139010105Sadam.leventhal@sun.com if (i < nt && c == t[i]) { 139110105Sadam.leventhal@sun.com tgts[ntgts++] = c; 139210105Sadam.leventhal@sun.com i++; 139310105Sadam.leventhal@sun.com } else if (rm->rm_col[c].rc_error != 0) { 139410105Sadam.leventhal@sun.com tgts[ntgts++] = c; 139510105Sadam.leventhal@sun.com } else if (c >= rm->rm_firstdatacol) { 139610105Sadam.leventhal@sun.com nbaddata--; 139710105Sadam.leventhal@sun.com } else { 139810105Sadam.leventhal@sun.com parity_valid[c] = B_TRUE; 139910105Sadam.leventhal@sun.com nbadparity--; 140010105Sadam.leventhal@sun.com } 140110105Sadam.leventhal@sun.com } 140210105Sadam.leventhal@sun.com 140310105Sadam.leventhal@sun.com ASSERT(ntgts >= nt); 140410105Sadam.leventhal@sun.com ASSERT(nbaddata >= 0); 140510105Sadam.leventhal@sun.com ASSERT(nbaddata + nbadparity == ntgts); 140610105Sadam.leventhal@sun.com 140710105Sadam.leventhal@sun.com dt = &tgts[nbadparity]; 140810105Sadam.leventhal@sun.com 140910105Sadam.leventhal@sun.com /* 141010105Sadam.leventhal@sun.com * See if we can use any of our optimized reconstruction routines. 
141110105Sadam.leventhal@sun.com */ 141210105Sadam.leventhal@sun.com if (!vdev_raidz_default_to_general) { 141310105Sadam.leventhal@sun.com switch (nbaddata) { 141410105Sadam.leventhal@sun.com case 1: 141510105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_P]) 141610105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_p(rm, dt, 1)); 141710105Sadam.leventhal@sun.com 141810105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 1); 141910105Sadam.leventhal@sun.com 142010105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_Q]) 142110105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_q(rm, dt, 1)); 142210105Sadam.leventhal@sun.com 142310105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 2); 142410105Sadam.leventhal@sun.com break; 142510105Sadam.leventhal@sun.com 142610105Sadam.leventhal@sun.com case 2: 142710105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 1); 142810105Sadam.leventhal@sun.com 142910105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_P] && 143010105Sadam.leventhal@sun.com parity_valid[VDEV_RAIDZ_Q]) 143110105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 143210105Sadam.leventhal@sun.com 143310105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 2); 143410105Sadam.leventhal@sun.com 143510105Sadam.leventhal@sun.com break; 143610105Sadam.leventhal@sun.com } 143710105Sadam.leventhal@sun.com } 143810105Sadam.leventhal@sun.com 143910105Sadam.leventhal@sun.com code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 144010105Sadam.leventhal@sun.com ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 144110105Sadam.leventhal@sun.com ASSERT(code > 0); 144210105Sadam.leventhal@sun.com return (code); 144310105Sadam.leventhal@sun.com } 14442082Seschrock 1445789Sahrens static int 1446789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 1447789Sahrens { 144810105Sadam.leventhal@sun.com vdev_t *cvd; 14492082Seschrock uint64_t nparity = vd->vdev_nparity; 145010105Sadam.leventhal@sun.com int c; 
1451789Sahrens int lasterror = 0; 1452789Sahrens int numerrors = 0; 1453789Sahrens 14542082Seschrock ASSERT(nparity > 0); 14552082Seschrock 14562082Seschrock if (nparity > VDEV_RAIDZ_MAXPARITY || 14572082Seschrock vd->vdev_children < nparity + 1) { 1458789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1459789Sahrens return (EINVAL); 1460789Sahrens } 1461789Sahrens 14629846SEric.Taylor@Sun.COM vdev_open_children(vd); 1463789Sahrens 146410105Sadam.leventhal@sun.com for (c = 0; c < vd->vdev_children; c++) { 146510105Sadam.leventhal@sun.com cvd = vd->vdev_child[c]; 14669846SEric.Taylor@Sun.COM 146710105Sadam.leventhal@sun.com if (cvd->vdev_open_error != 0) { 14689846SEric.Taylor@Sun.COM lasterror = cvd->vdev_open_error; 1469789Sahrens numerrors++; 1470789Sahrens continue; 1471789Sahrens } 1472789Sahrens 1473789Sahrens *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 14741732Sbonwick *ashift = MAX(*ashift, cvd->vdev_ashift); 1475789Sahrens } 1476789Sahrens 1477789Sahrens *asize *= vd->vdev_children; 1478789Sahrens 14792082Seschrock if (numerrors > nparity) { 1480789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1481789Sahrens return (lasterror); 1482789Sahrens } 1483789Sahrens 1484789Sahrens return (0); 1485789Sahrens } 1486789Sahrens 1487789Sahrens static void 1488789Sahrens vdev_raidz_close(vdev_t *vd) 1489789Sahrens { 149010105Sadam.leventhal@sun.com int c; 149110105Sadam.leventhal@sun.com 149210105Sadam.leventhal@sun.com for (c = 0; c < vd->vdev_children; c++) 1493789Sahrens vdev_close(vd->vdev_child[c]); 1494789Sahrens } 1495789Sahrens 1496789Sahrens static uint64_t 1497789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1498789Sahrens { 1499789Sahrens uint64_t asize; 15001732Sbonwick uint64_t ashift = vd->vdev_top->vdev_ashift; 1501789Sahrens uint64_t cols = vd->vdev_children; 15022082Seschrock uint64_t nparity = vd->vdev_nparity; 1503789Sahrens 15041732Sbonwick asize = ((psize - 1) >> ashift) + 1; 15052082Seschrock asize += nparity * ((asize + 
cols - nparity - 1) / (cols - nparity)); 15062082Seschrock asize = roundup(asize, nparity + 1) << ashift; 1507789Sahrens 1508789Sahrens return (asize); 1509789Sahrens } 1510789Sahrens 1511789Sahrens static void 1512789Sahrens vdev_raidz_child_done(zio_t *zio) 1513789Sahrens { 1514789Sahrens raidz_col_t *rc = zio->io_private; 1515789Sahrens 1516789Sahrens rc->rc_error = zio->io_error; 1517789Sahrens rc->rc_tried = 1; 1518789Sahrens rc->rc_skipped = 0; 1519789Sahrens } 1520789Sahrens 15215530Sbonwick static int 1522789Sahrens vdev_raidz_io_start(zio_t *zio) 1523789Sahrens { 1524789Sahrens vdev_t *vd = zio->io_vd; 15251732Sbonwick vdev_t *tvd = vd->vdev_top; 1526789Sahrens vdev_t *cvd; 1527789Sahrens blkptr_t *bp = zio->io_bp; 1528789Sahrens raidz_map_t *rm; 1529789Sahrens raidz_col_t *rc; 153010105Sadam.leventhal@sun.com int c, i; 1531789Sahrens 15322082Seschrock rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 15332082Seschrock vd->vdev_nparity); 1534789Sahrens 15351775Sbillm ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1536789Sahrens 1537789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 153810105Sadam.leventhal@sun.com vdev_raidz_generate_parity(rm); 1539789Sahrens 1540789Sahrens for (c = 0; c < rm->rm_cols; c++) { 1541789Sahrens rc = &rm->rm_col[c]; 15422082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 1543789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1544789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 15457754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0, 1546789Sahrens vdev_raidz_child_done, rc)); 1547789Sahrens } 15485530Sbonwick 154910105Sadam.leventhal@sun.com /* 155010105Sadam.leventhal@sun.com * Generate optional I/Os for any skipped sectors to improve 155110105Sadam.leventhal@sun.com * aggregation contiguity. 
155210105Sadam.leventhal@sun.com */ 155310450Sadam.leventhal@sun.com for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 155410105Sadam.leventhal@sun.com ASSERT(c <= rm->rm_scols); 155510105Sadam.leventhal@sun.com if (c == rm->rm_scols) 155610105Sadam.leventhal@sun.com c = 0; 155710105Sadam.leventhal@sun.com rc = &rm->rm_col[c]; 155810105Sadam.leventhal@sun.com cvd = vd->vdev_child[rc->rc_devidx]; 155910105Sadam.leventhal@sun.com zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 156010105Sadam.leventhal@sun.com rc->rc_offset + rc->rc_size, NULL, 156110105Sadam.leventhal@sun.com 1 << tvd->vdev_ashift, 156210105Sadam.leventhal@sun.com zio->io_type, zio->io_priority, 156310105Sadam.leventhal@sun.com ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 156410105Sadam.leventhal@sun.com } 156510105Sadam.leventhal@sun.com 15667754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 1567789Sahrens } 1568789Sahrens 1569789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 1570789Sahrens 15712082Seschrock /* 15722082Seschrock * Iterate over the columns in reverse order so that we hit the parity 157310105Sadam.leventhal@sun.com * last -- any errors along the way will force us to read the parity. 
15742082Seschrock */ 1575789Sahrens for (c = rm->rm_cols - 1; c >= 0; c--) { 1576789Sahrens rc = &rm->rm_col[c]; 15772082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 15785329Sgw25295 if (!vdev_readable(cvd)) { 15792082Seschrock if (c >= rm->rm_firstdatacol) 15802082Seschrock rm->rm_missingdata++; 15812082Seschrock else 15822082Seschrock rm->rm_missingparity++; 1583789Sahrens rc->rc_error = ENXIO; 1584789Sahrens rc->rc_tried = 1; /* don't even try */ 1585789Sahrens rc->rc_skipped = 1; 1586789Sahrens continue; 1587789Sahrens } 15888241SJeff.Bonwick@Sun.COM if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) { 15892082Seschrock if (c >= rm->rm_firstdatacol) 15902082Seschrock rm->rm_missingdata++; 15912082Seschrock else 15922082Seschrock rm->rm_missingparity++; 1593789Sahrens rc->rc_error = ESTALE; 1594789Sahrens rc->rc_skipped = 1; 1595789Sahrens continue; 1596789Sahrens } 15972082Seschrock if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 15989434SMark.Musante@Sun.COM (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1599789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1600789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 16017754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0, 1602789Sahrens vdev_raidz_child_done, rc)); 1603789Sahrens } 1604789Sahrens } 1605789Sahrens 16067754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 1607789Sahrens } 1608789Sahrens 16091544Seschrock /* 16101544Seschrock * Report a checksum error for a child of a RAID-Z device. 
16111544Seschrock */ 16121544Seschrock static void 161310614SJonathan.Adams@Sun.COM raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) 16141544Seschrock { 16152082Seschrock vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 16161544Seschrock 16171544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 161810614SJonathan.Adams@Sun.COM zio_bad_cksum_t zbc; 161910614SJonathan.Adams@Sun.COM raidz_map_t *rm = zio->io_vsd; 162010614SJonathan.Adams@Sun.COM 16211544Seschrock mutex_enter(&vd->vdev_stat_lock); 16221544Seschrock vd->vdev_stat.vs_checksum_errors++; 16231544Seschrock mutex_exit(&vd->vdev_stat_lock); 162410614SJonathan.Adams@Sun.COM 162510614SJonathan.Adams@Sun.COM zbc.zbc_has_cksum = 0; 162610614SJonathan.Adams@Sun.COM zbc.zbc_injected = rm->rm_ecksuminjected; 162710614SJonathan.Adams@Sun.COM 162810614SJonathan.Adams@Sun.COM zfs_ereport_post_checksum(zio->io_spa, vd, zio, 162910614SJonathan.Adams@Sun.COM rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, 163010614SJonathan.Adams@Sun.COM &zbc); 16311544Seschrock } 163210614SJonathan.Adams@Sun.COM } 16331544Seschrock 163410614SJonathan.Adams@Sun.COM /* 163510614SJonathan.Adams@Sun.COM * We keep track of whether or not there were any injected errors, so that 163610614SJonathan.Adams@Sun.COM * any ereports we generate can note it. 
163710614SJonathan.Adams@Sun.COM */ 163810614SJonathan.Adams@Sun.COM static int 163910614SJonathan.Adams@Sun.COM raidz_checksum_verify(zio_t *zio) 164010614SJonathan.Adams@Sun.COM { 164110614SJonathan.Adams@Sun.COM zio_bad_cksum_t zbc; 164210614SJonathan.Adams@Sun.COM raidz_map_t *rm = zio->io_vsd; 164310614SJonathan.Adams@Sun.COM 164410614SJonathan.Adams@Sun.COM int ret = zio_checksum_error(zio, &zbc); 164510614SJonathan.Adams@Sun.COM if (ret != 0 && zbc.zbc_injected != 0) 164610614SJonathan.Adams@Sun.COM rm->rm_ecksuminjected = 1; 164710614SJonathan.Adams@Sun.COM 164810614SJonathan.Adams@Sun.COM return (ret); 16491544Seschrock } 16501544Seschrock 16512082Seschrock /* 16522082Seschrock * Generate the parity from the data columns. If we tried and were able to 16532082Seschrock * read the parity without error, verify that the generated parity matches the 16542082Seschrock * data we read. If it doesn't, we fire off a checksum error. Return the 16552082Seschrock * number such failures. 16562082Seschrock */ 16572082Seschrock static int 16582082Seschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 16592082Seschrock { 16602082Seschrock void *orig[VDEV_RAIDZ_MAXPARITY]; 16612082Seschrock int c, ret = 0; 16622082Seschrock raidz_col_t *rc; 16632082Seschrock 16642082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 16652082Seschrock rc = &rm->rm_col[c]; 16662082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 16672082Seschrock continue; 16682082Seschrock orig[c] = zio_buf_alloc(rc->rc_size); 16692082Seschrock bcopy(rc->rc_data, orig[c], rc->rc_size); 16702082Seschrock } 16712082Seschrock 167210105Sadam.leventhal@sun.com vdev_raidz_generate_parity(rm); 16732082Seschrock 16742082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 16752082Seschrock rc = &rm->rm_col[c]; 16762082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 16772082Seschrock continue; 16782082Seschrock if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 167910614SJonathan.Adams@Sun.COM 
raidz_checksum_error(zio, rc, orig[c]); 16802082Seschrock rc->rc_error = ECKSUM; 16812082Seschrock ret++; 16822082Seschrock } 16832082Seschrock zio_buf_free(orig[c], rc->rc_size); 16842082Seschrock } 16852082Seschrock 16862082Seschrock return (ret); 16872082Seschrock } 16882082Seschrock 168910105Sadam.leventhal@sun.com /* 169010105Sadam.leventhal@sun.com * Keep statistics on all the ways that we used parity to correct data. 169110105Sadam.leventhal@sun.com */ 169210105Sadam.leventhal@sun.com static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 16931544Seschrock 16945530Sbonwick static int 16957754SJeff.Bonwick@Sun.COM vdev_raidz_worst_error(raidz_map_t *rm) 16967754SJeff.Bonwick@Sun.COM { 16977754SJeff.Bonwick@Sun.COM int error = 0; 16987754SJeff.Bonwick@Sun.COM 16997754SJeff.Bonwick@Sun.COM for (int c = 0; c < rm->rm_cols; c++) 17007754SJeff.Bonwick@Sun.COM error = zio_worst_error(error, rm->rm_col[c].rc_error); 17017754SJeff.Bonwick@Sun.COM 17027754SJeff.Bonwick@Sun.COM return (error); 17037754SJeff.Bonwick@Sun.COM } 17047754SJeff.Bonwick@Sun.COM 170510105Sadam.leventhal@sun.com /* 170610105Sadam.leventhal@sun.com * Iterate over all combinations of bad data and attempt a reconstruction. 170710105Sadam.leventhal@sun.com * Note that the algorithm below is non-optimal because it doesn't take into 170810105Sadam.leventhal@sun.com * account how reconstruction is actually performed. For example, with 170910105Sadam.leventhal@sun.com * triple-parity RAID-Z the reconstruction procedure is the same if column 4 171010105Sadam.leventhal@sun.com * is targeted as invalid as if columns 1 and 4 are targeted since in both 171110105Sadam.leventhal@sun.com * cases we'd only use parity information in column 0. 
/*
 * Combinatorial reconstruction: iterate over all combinations of columns
 * that reported no error, treat each combination as bad, reconstruct it and
 * check whether the block checksum then verifies.
 *
 *	zio		the RAID-Z zio; its map is taken from zio->io_vsd
 *	total_errors	number of columns already known to be in error; must
 *			be smaller than the number of parity columns
 *	data_errors	number of those errors that are in data columns
 *
 * Combinations of size n = 1 .. (rm_firstdatacol - total_errors) are tried
 * in turn.  On the first combination whose reconstruction verifies, the
 * columns in it are marked ECKSUM (and reported, if they had been read) and
 * the code returned by vdev_raidz_reconstruct() is returned.  If nothing
 * verifies, the column data is left as it was found and 0 is returned.
 *
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed.  For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	/* One save buffer per target; orig[i] backs up column tgts[i]. */
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* tgts[] has one spare slot on each side for the sentinels below. */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition: tgts[current - 1] is readable
	 * even when current is 0.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition: the
		 * last target always has an upper neighbour to compare with.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/*
		 * Each pass of the outer loop adds exactly one buffer, sized
		 * like column 0.
		 */
		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/*
		 * Each pass tries one combination.  'current' is the slot
		 * that was advanced last and 'next' the column it moves to;
		 * current reaches n once every slot has run into its upper
		 * neighbour, i.e. all combinations of this size were tried.
		 */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * The targeted columns held bad data: report
				 * the ones that were actually read, passing
				 * the saved copy as the bad data, and mark
				 * them all ECKSUM.
				 */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position, reset this slot to it
				 * and carry on to the next slot.
				 * NOTE(review): this scan has no upper bound;
				 * it presumably relies on the slot's old value
				 * being such a column -- confirm.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/*
	 * The loop ran off its end, so n is one more than the number of
	 * buffers allocated; the success path jumps to done with n equal to
	 * that number.
	 */
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 18872082Seschrock 18882082Seschrock if (c < rm->rm_firstdatacol) 18892082Seschrock parity_errors++; 18902082Seschrock else 18912082Seschrock data_errors++; 18922082Seschrock 1893789Sahrens if (!rc->rc_skipped) 1894789Sahrens unexpected_errors++; 18952082Seschrock 18967754SJeff.Bonwick@Sun.COM total_errors++; 18973456Sahl } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 18983456Sahl parity_untried++; 1899789Sahrens } 1900789Sahrens } 1901789Sahrens 1902789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 1903789Sahrens /* 19047754SJeff.Bonwick@Sun.COM * XXX -- for now, treat partial writes as a success. 19057754SJeff.Bonwick@Sun.COM * (If we couldn't write enough columns to reconstruct 19067754SJeff.Bonwick@Sun.COM * the data, the I/O failed. Otherwise, good enough.) 19077754SJeff.Bonwick@Sun.COM * 19087754SJeff.Bonwick@Sun.COM * Now that we support write reallocation, it would be better 19097754SJeff.Bonwick@Sun.COM * to treat partial failure as real failure unless there are 19107754SJeff.Bonwick@Sun.COM * no non-degraded top-level vdevs left, and not update DTLs 19117754SJeff.Bonwick@Sun.COM * if we intend to reallocate. 1912789Sahrens */ 1913789Sahrens /* XXPOLICY */ 19147754SJeff.Bonwick@Sun.COM if (total_errors > rm->rm_firstdatacol) 19157754SJeff.Bonwick@Sun.COM zio->io_error = vdev_raidz_worst_error(rm); 1916789Sahrens 19177754SJeff.Bonwick@Sun.COM return; 1918789Sahrens } 1919789Sahrens 1920789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 19212082Seschrock /* 19222082Seschrock * There are three potential phases for a read: 19232082Seschrock * 1. produce valid data from the columns read 19242082Seschrock * 2. read all disks and try again 19252082Seschrock * 3. perform combinatorial reconstruction 19262082Seschrock * 19272082Seschrock * Each phase is progressively both more expensive and less likely to 19282082Seschrock * occur. 
If we encounter more errors than we can repair or all phases 19292082Seschrock * fail, we have no choice but to return an error. 19302082Seschrock */ 1931789Sahrens 1932789Sahrens /* 19332082Seschrock * If the number of errors we saw was correctable -- less than or equal 19343456Sahl * to the number of parity disks read -- attempt to produce data that 19353456Sahl * has a valid checksum. Naturally, this case applies in the absence of 19363456Sahl * any errors. 1937789Sahrens */ 19387754SJeff.Bonwick@Sun.COM if (total_errors <= rm->rm_firstdatacol - parity_untried) { 193910105Sadam.leventhal@sun.com if (data_errors == 0) { 194010614SJonathan.Adams@Sun.COM if (raidz_checksum_verify(zio) == 0) { 19414034Sahl /* 19424034Sahl * If we read parity information (unnecessarily 19434034Sahl * as it happens since no reconstruction was 19444034Sahl * needed) regenerate and verify the parity. 19454034Sahl * We also regenerate parity when resilvering 19464034Sahl * so we can write it out to the failed device 19474034Sahl * later. 19484034Sahl */ 19493456Sahl if (parity_errors + parity_untried < 19504034Sahl rm->rm_firstdatacol || 19514034Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) { 19523456Sahl n = raidz_parity_verify(zio, rm); 19533456Sahl unexpected_errors += n; 19543456Sahl ASSERT(parity_errors + n <= 19553456Sahl rm->rm_firstdatacol); 19563456Sahl } 19572082Seschrock goto done; 19582082Seschrock } 195910105Sadam.leventhal@sun.com } else { 19603456Sahl /* 19613456Sahl * We either attempt to read all the parity columns or 19623456Sahl * none of them. If we didn't try to read parity, we 19633456Sahl * wouldn't be here in the correctable case. There must 19643456Sahl * also have been fewer parity errors than parity 19653456Sahl * columns or, again, we wouldn't be in this code path. 
19663456Sahl */ 19673456Sahl ASSERT(parity_untried == 0); 19682082Seschrock ASSERT(parity_errors < rm->rm_firstdatacol); 19692082Seschrock 19702082Seschrock /* 197110105Sadam.leventhal@sun.com * Identify the data columns that reported an error. 19722082Seschrock */ 197310105Sadam.leventhal@sun.com n = 0; 19742082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 19752082Seschrock rc = &rm->rm_col[c]; 197610105Sadam.leventhal@sun.com if (rc->rc_error != 0) { 197710105Sadam.leventhal@sun.com ASSERT(n < VDEV_RAIDZ_MAXPARITY); 197810105Sadam.leventhal@sun.com tgts[n++] = c; 197910105Sadam.leventhal@sun.com } 19802082Seschrock } 19812082Seschrock 198210105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol >= n); 198310105Sadam.leventhal@sun.com 198410105Sadam.leventhal@sun.com code = vdev_raidz_reconstruct(rm, tgts, n); 19852082Seschrock 198610614SJonathan.Adams@Sun.COM if (raidz_checksum_verify(zio) == 0) { 198710105Sadam.leventhal@sun.com atomic_inc_64(&raidz_corrected[code]); 1988789Sahrens 19892082Seschrock /* 199010105Sadam.leventhal@sun.com * If we read more parity disks than were used 199110105Sadam.leventhal@sun.com * for reconstruction, confirm that the other 199210105Sadam.leventhal@sun.com * parity disks produced correct data. This 199310105Sadam.leventhal@sun.com * routine is suboptimal in that it regenerates 199410105Sadam.leventhal@sun.com * the parity that we already used in addition 199510105Sadam.leventhal@sun.com * to the parity that we're attempting to 199610105Sadam.leventhal@sun.com * verify, but this should be a relatively 199710105Sadam.leventhal@sun.com * uncommon case, and can be optimized if it 199810105Sadam.leventhal@sun.com * becomes a problem. Note that we regenerate 199910105Sadam.leventhal@sun.com * parity when resilvering so we can write it 200010105Sadam.leventhal@sun.com * out to failed devices later. 
20012082Seschrock */ 200210105Sadam.leventhal@sun.com if (parity_errors < rm->rm_firstdatacol - n || 20034034Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) { 20042082Seschrock n = raidz_parity_verify(zio, rm); 20052082Seschrock unexpected_errors += n; 20062082Seschrock ASSERT(parity_errors + n <= 20072082Seschrock rm->rm_firstdatacol); 20082082Seschrock } 20092082Seschrock 20102082Seschrock goto done; 20112082Seschrock } 2012789Sahrens } 2013789Sahrens } 2014789Sahrens 2015789Sahrens /* 20162082Seschrock * This isn't a typical situation -- either we got a read error or 20172082Seschrock * a child silently returned bad data. Read every block so we can 20182082Seschrock * try again with as much data and parity as we can track down. If 20192082Seschrock * we've already been through once before, all children will be marked 20202082Seschrock * as tried so we'll proceed to combinatorial reconstruction. 2021789Sahrens */ 2022789Sahrens unexpected_errors = 1; 20232082Seschrock rm->rm_missingdata = 0; 20242082Seschrock rm->rm_missingparity = 0; 2025789Sahrens 20262082Seschrock for (c = 0; c < rm->rm_cols; c++) { 20272082Seschrock if (rm->rm_col[c].rc_tried) 20282082Seschrock continue; 2029789Sahrens 2030789Sahrens zio_vdev_io_redone(zio); 20312082Seschrock do { 2032789Sahrens rc = &rm->rm_col[c]; 2033789Sahrens if (rc->rc_tried) 2034789Sahrens continue; 2035789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, 20362082Seschrock vd->vdev_child[rc->rc_devidx], 2037789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 20387754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0, 2039789Sahrens vdev_raidz_child_done, rc)); 20402082Seschrock } while (++c < rm->rm_cols); 20415530Sbonwick 20427754SJeff.Bonwick@Sun.COM return; 2043789Sahrens } 2044789Sahrens 2045789Sahrens /* 20462082Seschrock * At this point we've attempted to reconstruct the data given the 20472082Seschrock * errors we detected, and we've attempted to read all columns. 
There 20482082Seschrock * must, therefore, be one or more additional problems -- silent errors 20492082Seschrock * resulting in invalid data rather than explicit I/O errors resulting 205010105Sadam.leventhal@sun.com * in absent data. We check if there is enough additional data to 205110105Sadam.leventhal@sun.com * possibly reconstruct the data and then perform combinatorial 205210105Sadam.leventhal@sun.com * reconstruction over all possible combinations. If that fails, 205310105Sadam.leventhal@sun.com * we're cooked. 2054789Sahrens */ 205510614SJonathan.Adams@Sun.COM if (total_errors > rm->rm_firstdatacol) { 20567754SJeff.Bonwick@Sun.COM zio->io_error = vdev_raidz_worst_error(rm); 20572082Seschrock 205810614SJonathan.Adams@Sun.COM } else if (total_errors < rm->rm_firstdatacol && 205910614SJonathan.Adams@Sun.COM (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { 20602082Seschrock /* 206110105Sadam.leventhal@sun.com * If we didn't use all the available parity for the 206210105Sadam.leventhal@sun.com * combinatorial reconstruction, verify that the remaining 206310105Sadam.leventhal@sun.com * parity is correct. 20642082Seschrock */ 206510105Sadam.leventhal@sun.com if (code != (1 << rm->rm_firstdatacol) - 1) 206610105Sadam.leventhal@sun.com (void) raidz_parity_verify(zio, rm); 206710105Sadam.leventhal@sun.com } else { 206810105Sadam.leventhal@sun.com /* 206910614SJonathan.Adams@Sun.COM * We're here because either: 207010614SJonathan.Adams@Sun.COM * 207110614SJonathan.Adams@Sun.COM * total_errors == rm_first_datacol, or 207210614SJonathan.Adams@Sun.COM * vdev_raidz_combrec() failed 207310614SJonathan.Adams@Sun.COM * 207410614SJonathan.Adams@Sun.COM * In either case, there is enough bad data to prevent 207510614SJonathan.Adams@Sun.COM * reconstruction. 207610614SJonathan.Adams@Sun.COM * 207710614SJonathan.Adams@Sun.COM * Start checksum ereports for all children which haven't 207810614SJonathan.Adams@Sun.COM * failed. 
207910105Sadam.leventhal@sun.com */ 208010105Sadam.leventhal@sun.com zio->io_error = ECKSUM; 20812082Seschrock 208210614SJonathan.Adams@Sun.COM for (c = 0; c < rm->rm_cols; c++) { 208310614SJonathan.Adams@Sun.COM rc = &rm->rm_col[c]; 208410614SJonathan.Adams@Sun.COM if (rc->rc_error == 0) { 208510614SJonathan.Adams@Sun.COM zio_bad_cksum_t zbc; 208610614SJonathan.Adams@Sun.COM zbc.zbc_has_cksum = 0; 208710614SJonathan.Adams@Sun.COM zbc.zbc_injected = rm->rm_ecksuminjected; 208810614SJonathan.Adams@Sun.COM 208910614SJonathan.Adams@Sun.COM zfs_ereport_start_checksum( 209010105Sadam.leventhal@sun.com zio->io_spa, vd->vdev_child[rc->rc_devidx], 209110614SJonathan.Adams@Sun.COM zio, rc->rc_offset, rc->rc_size, 209210614SJonathan.Adams@Sun.COM (void *)(uintptr_t)c, &zbc); 20932082Seschrock } 20941544Seschrock } 20951544Seschrock } 2096789Sahrens 2097789Sahrens done: 2098789Sahrens zio_checksum_verified(zio); 2099789Sahrens 21008241SJeff.Bonwick@Sun.COM if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2101789Sahrens (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2102789Sahrens /* 2103789Sahrens * Use the good data we have in hand to repair damaged children. 2104789Sahrens */ 2105789Sahrens for (c = 0; c < rm->rm_cols; c++) { 2106789Sahrens rc = &rm->rm_col[c]; 21072082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 2108789Sahrens 21091732Sbonwick if (rc->rc_error == 0) 21101732Sbonwick continue; 21111732Sbonwick 21127754SJeff.Bonwick@Sun.COM zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 21131732Sbonwick rc->rc_offset, rc->rc_data, rc->rc_size, 21141732Sbonwick ZIO_TYPE_WRITE, zio->io_priority, 21158241SJeff.Bonwick@Sun.COM ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
21168241SJeff.Bonwick@Sun.COM ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 21171732Sbonwick } 2118789Sahrens } 2119789Sahrens } 2120789Sahrens 2121789Sahrens static void 2122789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2123789Sahrens { 21242082Seschrock if (faulted > vd->vdev_nparity) 21251544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 21261544Seschrock VDEV_AUX_NO_REPLICAS); 2127789Sahrens else if (degraded + faulted != 0) 21281544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2129789Sahrens else 21301544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2131789Sahrens } 2132789Sahrens 2133789Sahrens vdev_ops_t vdev_raidz_ops = { 2134789Sahrens vdev_raidz_open, 2135789Sahrens vdev_raidz_close, 2136789Sahrens vdev_raidz_asize, 2137789Sahrens vdev_raidz_io_start, 2138789Sahrens vdev_raidz_io_done, 2139789Sahrens vdev_raidz_state_change, 2140789Sahrens VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2141789Sahrens B_FALSE /* not a leaf vdev */ 2142789Sahrens }; 2143