1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 212082Seschrock 22789Sahrens /* 2311670SNeil.Perrin@Sun.COM * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24789Sahrens * Use is subject to license terms. 25789Sahrens */ 26789Sahrens 27789Sahrens #include <sys/zfs_context.h> 28789Sahrens #include <sys/spa.h> 29789Sahrens #include <sys/vdev_impl.h> 30789Sahrens #include <sys/zio.h> 31789Sahrens #include <sys/zio_checksum.h> 32789Sahrens #include <sys/fs/zfs.h> 331544Seschrock #include <sys/fm/fs/zfs.h> 34789Sahrens 35789Sahrens /* 36789Sahrens * Virtual device vector for RAID-Z. 372082Seschrock * 3810105Sadam.leventhal@sun.com * This vdev supports single, double, and triple parity. For single parity, 3910105Sadam.leventhal@sun.com * we use a simple XOR of all the data columns. For double or triple parity, 4010105Sadam.leventhal@sun.com * we use a special case of Reed-Solomon coding. 
This extends the 4110105Sadam.leventhal@sun.com * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 4210105Sadam.leventhal@sun.com * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 4310105Sadam.leventhal@sun.com * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 4410105Sadam.leventhal@sun.com * former is also based. The latter is designed to provide higher performance 4510105Sadam.leventhal@sun.com * for writes. 4610105Sadam.leventhal@sun.com * 4710105Sadam.leventhal@sun.com * Note that the Plank paper claimed to support arbitrary N+M, but was then 4810105Sadam.leventhal@sun.com * amended six years later identifying a critical flaw that invalidates its 4910105Sadam.leventhal@sun.com * claims. Nevertheless, the technique can be adapted to work for up to 5010105Sadam.leventhal@sun.com * triple parity. For additional parity, the amendment "Note: Correction to 5110105Sadam.leventhal@sun.com * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 5210105Sadam.leventhal@sun.com * is viable, but the additional complexity means that write performance will 5310105Sadam.leventhal@sun.com * suffer. 5410105Sadam.leventhal@sun.com * 5510105Sadam.leventhal@sun.com * All of the methods above operate on a Galois field, defined over the 5610105Sadam.leventhal@sun.com * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 5710105Sadam.leventhal@sun.com * can be expressed with a single byte. 
Briefly, the operations on the 5810105Sadam.leventhal@sun.com * field are defined as follows: 592082Seschrock * 602082Seschrock * o addition (+) is represented by a bitwise XOR 612082Seschrock * o subtraction (-) is therefore identical to addition: A + B = A - B 622082Seschrock * o multiplication of A by 2 is defined by the following bitwise expression: 632082Seschrock * (A * 2)_7 = A_6 642082Seschrock * (A * 2)_6 = A_5 652082Seschrock * (A * 2)_5 = A_4 662082Seschrock * (A * 2)_4 = A_3 + A_7 672082Seschrock * (A * 2)_3 = A_2 + A_7 682082Seschrock * (A * 2)_2 = A_1 + A_7 692082Seschrock * (A * 2)_1 = A_0 702082Seschrock * (A * 2)_0 = A_7 712082Seschrock * 722082Seschrock * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 7310105Sadam.leventhal@sun.com * As an aside, this multiplication is derived from the error correcting 7410105Sadam.leventhal@sun.com * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 752082Seschrock * 762082Seschrock * Observe that any number in the field (except for 0) can be expressed as a 772082Seschrock * power of 2 -- a generator for the field. We store a table of the powers of 782082Seschrock * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 792082Seschrock * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 8010105Sadam.leventhal@sun.com * than field addition). The inverse of a field element A (A^-1) is therefore 8110105Sadam.leventhal@sun.com * A ^ (255 - 1) = A^254. 822082Seschrock * 8310105Sadam.leventhal@sun.com * The up-to-three parity columns, P, Q, R over several data columns, 8410105Sadam.leventhal@sun.com * D_0, ... D_n-1, can be expressed by field operations: 852082Seschrock * 862082Seschrock * P = D_0 + D_1 + ... + D_n-2 + D_n-1 872082Seschrock * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 882082Seschrock * = ((...((D_0) * 2 + D_1) * 2 + ...) 
* 2 + D_n-2) * 2 + D_n-1 8910105Sadam.leventhal@sun.com * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 9010105Sadam.leventhal@sun.com * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 912082Seschrock * 9210105Sadam.leventhal@sun.com * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival 9310105Sadam.leventhal@sun.com * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 9410105Sadam.leventhal@sun.com * independent coefficients. (There are no additional coefficients that have 9510105Sadam.leventhal@sun.com * this property which is why the uncorrected Plank method breaks down.) 9610105Sadam.leventhal@sun.com * 9710105Sadam.leventhal@sun.com * See the reconstruction code below for how P, Q and R can used individually 9810105Sadam.leventhal@sun.com * or in concert to recover missing data columns. 99789Sahrens */ 100789Sahrens 101789Sahrens typedef struct raidz_col { 1022082Seschrock uint64_t rc_devidx; /* child device index for I/O */ 1032082Seschrock uint64_t rc_offset; /* device offset */ 1042082Seschrock uint64_t rc_size; /* I/O size */ 1052082Seschrock void *rc_data; /* I/O data */ 10610614SJonathan.Adams@Sun.COM void *rc_gdata; /* used to store the "good" version */ 1072082Seschrock int rc_error; /* I/O error for this device */ 1082082Seschrock uint8_t rc_tried; /* Did we attempt this I/O column? */ 1092082Seschrock uint8_t rc_skipped; /* Did we skip this I/O column? 
*/ 110789Sahrens } raidz_col_t; 111789Sahrens 112789Sahrens typedef struct raidz_map { 11310105Sadam.leventhal@sun.com uint64_t rm_cols; /* Regular column count */ 11410105Sadam.leventhal@sun.com uint64_t rm_scols; /* Count including skipped columns */ 1152082Seschrock uint64_t rm_bigcols; /* Number of oversized columns */ 1162082Seschrock uint64_t rm_asize; /* Actual total I/O size */ 1172082Seschrock uint64_t rm_missingdata; /* Count of missing data devices */ 1182082Seschrock uint64_t rm_missingparity; /* Count of missing parity devices */ 1192082Seschrock uint64_t rm_firstdatacol; /* First data column/parity count */ 12010450Sadam.leventhal@sun.com uint64_t rm_nskip; /* Skipped sectors for padding */ 12110450Sadam.leventhal@sun.com uint64_t rm_skipstart; /* Column index of padding start */ 12210614SJonathan.Adams@Sun.COM void *rm_datacopy; /* rm_asize-buffer of copied data */ 12310614SJonathan.Adams@Sun.COM uintptr_t rm_reports; /* # of referencing checksum reports */ 12410614SJonathan.Adams@Sun.COM uint8_t rm_freed; /* map no longer has referencing ZIO */ 12510614SJonathan.Adams@Sun.COM uint8_t rm_ecksuminjected; /* checksum error was injected */ 1262082Seschrock raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 127789Sahrens } raidz_map_t; 128789Sahrens 1292082Seschrock #define VDEV_RAIDZ_P 0 1302082Seschrock #define VDEV_RAIDZ_Q 1 13110105Sadam.leventhal@sun.com #define VDEV_RAIDZ_R 2 1322082Seschrock 13310105Sadam.leventhal@sun.com #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 13410105Sadam.leventhal@sun.com #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 1352082Seschrock 13610105Sadam.leventhal@sun.com /* 13710105Sadam.leventhal@sun.com * We provide a mechanism to perform the field multiplication operation on a 13810105Sadam.leventhal@sun.com * 64-bit value all at once rather than a byte at a time. 
This works by 13910105Sadam.leventhal@sun.com * creating a mask from the top bit in each byte and using that to 14010105Sadam.leventhal@sun.com * conditionally apply the XOR of 0x1d. 14110105Sadam.leventhal@sun.com */ 14210105Sadam.leventhal@sun.com #define VDEV_RAIDZ_64MUL_2(x, mask) \ 14310105Sadam.leventhal@sun.com { \ 14410105Sadam.leventhal@sun.com (mask) = (x) & 0x8080808080808080ULL; \ 14510105Sadam.leventhal@sun.com (mask) = ((mask) << 1) - ((mask) >> 7); \ 14610105Sadam.leventhal@sun.com (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 14710105Sadam.leventhal@sun.com ((mask) & 0x1d1d1d1d1d1d1d1d); \ 14810105Sadam.leventhal@sun.com } 14910105Sadam.leventhal@sun.com 15010105Sadam.leventhal@sun.com #define VDEV_RAIDZ_64MUL_4(x, mask) \ 15110105Sadam.leventhal@sun.com { \ 15210105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2((x), mask); \ 15310105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2((x), mask); \ 15410105Sadam.leventhal@sun.com } 15510105Sadam.leventhal@sun.com 15610105Sadam.leventhal@sun.com /* 15710105Sadam.leventhal@sun.com * Force reconstruction to use the general purpose method. 15810105Sadam.leventhal@sun.com */ 15910105Sadam.leventhal@sun.com int vdev_raidz_default_to_general; 1602082Seschrock 1612082Seschrock /* 1622082Seschrock * These two tables represent powers and logs of 2 in the Galois field defined 1632082Seschrock * above. These values were computed by repeatedly multiplying by 2 as above. 
1642082Seschrock */ 1652082Seschrock static const uint8_t vdev_raidz_pow2[256] = { 1662082Seschrock 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 1672082Seschrock 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 1682082Seschrock 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 1692082Seschrock 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 1702082Seschrock 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 1712082Seschrock 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 1722082Seschrock 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 1732082Seschrock 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 1742082Seschrock 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 1752082Seschrock 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 1762082Seschrock 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 1772082Seschrock 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 1782082Seschrock 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 1792082Seschrock 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 1802082Seschrock 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 1812082Seschrock 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 1822082Seschrock 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 1832082Seschrock 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 1842082Seschrock 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 1852082Seschrock 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 1862082Seschrock 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 1872082Seschrock 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 1882082Seschrock 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 1892082Seschrock 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 1902082Seschrock 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 1912082Seschrock 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 1922082Seschrock 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 1932082Seschrock 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 1942082Seschrock 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 1952082Seschrock 0xf7, 0xf3, 
0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 1962082Seschrock 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 1972082Seschrock 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 1982082Seschrock }; 1992082Seschrock static const uint8_t vdev_raidz_log2[256] = { 2002082Seschrock 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 2012082Seschrock 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 2022082Seschrock 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 2032082Seschrock 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 2042082Seschrock 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 2052082Seschrock 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 2062082Seschrock 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 2072082Seschrock 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 2082082Seschrock 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 2092082Seschrock 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 2102082Seschrock 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 2112082Seschrock 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 2122082Seschrock 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 2132082Seschrock 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 2142082Seschrock 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 2152082Seschrock 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 2162082Seschrock 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 2172082Seschrock 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 2182082Seschrock 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 2192082Seschrock 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 2202082Seschrock 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 2212082Seschrock 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 2222082Seschrock 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 2232082Seschrock 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 2242082Seschrock 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 2252082Seschrock 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 2262082Seschrock 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 
0xaa, 2272082Seschrock 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 2282082Seschrock 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 2292082Seschrock 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 2302082Seschrock 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 2312082Seschrock 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 2322082Seschrock }; 2332082Seschrock 23410614SJonathan.Adams@Sun.COM static void vdev_raidz_generate_parity(raidz_map_t *rm); 23510614SJonathan.Adams@Sun.COM 2362082Seschrock /* 2372082Seschrock * Multiply a given number by 2 raised to the given power. 2382082Seschrock */ 2392082Seschrock static uint8_t 2402082Seschrock vdev_raidz_exp2(uint_t a, int exp) 2412082Seschrock { 2422082Seschrock if (a == 0) 2432082Seschrock return (0); 2442082Seschrock 2452082Seschrock ASSERT(exp >= 0); 2462082Seschrock ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 2472082Seschrock 2482082Seschrock exp += vdev_raidz_log2[a]; 2492082Seschrock if (exp > 255) 2502082Seschrock exp -= 255; 2512082Seschrock 2522082Seschrock return (vdev_raidz_pow2[exp]); 2532082Seschrock } 2542082Seschrock 2557754SJeff.Bonwick@Sun.COM static void 25610614SJonathan.Adams@Sun.COM vdev_raidz_map_free(raidz_map_t *rm) 2577754SJeff.Bonwick@Sun.COM { 2587754SJeff.Bonwick@Sun.COM int c; 25910653SJonathan.Adams@Sun.COM size_t size; 2607754SJeff.Bonwick@Sun.COM 26110614SJonathan.Adams@Sun.COM for (c = 0; c < rm->rm_firstdatacol; c++) { 2627754SJeff.Bonwick@Sun.COM zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 2637754SJeff.Bonwick@Sun.COM 26410614SJonathan.Adams@Sun.COM if (rm->rm_col[c].rc_gdata != NULL) 26510614SJonathan.Adams@Sun.COM zio_buf_free(rm->rm_col[c].rc_gdata, 26610614SJonathan.Adams@Sun.COM rm->rm_col[c].rc_size); 26710614SJonathan.Adams@Sun.COM } 26810614SJonathan.Adams@Sun.COM 26910653SJonathan.Adams@Sun.COM size = 0; 27010653SJonathan.Adams@Sun.COM for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 27110653SJonathan.Adams@Sun.COM size += rm->rm_col[c].rc_size; 
27210653SJonathan.Adams@Sun.COM 27310614SJonathan.Adams@Sun.COM if (rm->rm_datacopy != NULL) 27410614SJonathan.Adams@Sun.COM zio_buf_free(rm->rm_datacopy, size); 27510614SJonathan.Adams@Sun.COM 27610105Sadam.leventhal@sun.com kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 2777754SJeff.Bonwick@Sun.COM } 2787754SJeff.Bonwick@Sun.COM 27910614SJonathan.Adams@Sun.COM static void 28010614SJonathan.Adams@Sun.COM vdev_raidz_map_free_vsd(zio_t *zio) 28110614SJonathan.Adams@Sun.COM { 28210614SJonathan.Adams@Sun.COM raidz_map_t *rm = zio->io_vsd; 28310614SJonathan.Adams@Sun.COM 28410614SJonathan.Adams@Sun.COM ASSERT3U(rm->rm_freed, ==, 0); 28510614SJonathan.Adams@Sun.COM rm->rm_freed = 1; 28610614SJonathan.Adams@Sun.COM 28710614SJonathan.Adams@Sun.COM if (rm->rm_reports == 0) 28810614SJonathan.Adams@Sun.COM vdev_raidz_map_free(rm); 28910614SJonathan.Adams@Sun.COM } 29010614SJonathan.Adams@Sun.COM 29110614SJonathan.Adams@Sun.COM /*ARGSUSED*/ 29210614SJonathan.Adams@Sun.COM static void 29310614SJonathan.Adams@Sun.COM vdev_raidz_cksum_free(void *arg, size_t ignored) 29410614SJonathan.Adams@Sun.COM { 29510614SJonathan.Adams@Sun.COM raidz_map_t *rm = arg; 29610614SJonathan.Adams@Sun.COM 29710614SJonathan.Adams@Sun.COM ASSERT3U(rm->rm_reports, >, 0); 29810614SJonathan.Adams@Sun.COM 29910653SJonathan.Adams@Sun.COM if (--rm->rm_reports == 0 && rm->rm_freed != 0) 30010614SJonathan.Adams@Sun.COM vdev_raidz_map_free(rm); 30110614SJonathan.Adams@Sun.COM } 30210614SJonathan.Adams@Sun.COM 30310614SJonathan.Adams@Sun.COM static void 30410614SJonathan.Adams@Sun.COM vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) 30510614SJonathan.Adams@Sun.COM { 30610614SJonathan.Adams@Sun.COM raidz_map_t *rm = zcr->zcr_cbdata; 30710614SJonathan.Adams@Sun.COM size_t c = zcr->zcr_cbinfo; 30810614SJonathan.Adams@Sun.COM size_t x; 30910614SJonathan.Adams@Sun.COM 31010614SJonathan.Adams@Sun.COM const char *good = NULL; 31110614SJonathan.Adams@Sun.COM const char *bad = 
rm->rm_col[c].rc_data; 31210614SJonathan.Adams@Sun.COM 31310614SJonathan.Adams@Sun.COM if (good_data == NULL) { 31410614SJonathan.Adams@Sun.COM zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 31510614SJonathan.Adams@Sun.COM return; 31610614SJonathan.Adams@Sun.COM } 31710614SJonathan.Adams@Sun.COM 31810614SJonathan.Adams@Sun.COM if (c < rm->rm_firstdatacol) { 31910614SJonathan.Adams@Sun.COM /* 32010614SJonathan.Adams@Sun.COM * The first time through, calculate the parity blocks for 32110614SJonathan.Adams@Sun.COM * the good data (this relies on the fact that the good 32210614SJonathan.Adams@Sun.COM * data never changes for a given logical ZIO) 32310614SJonathan.Adams@Sun.COM */ 32410614SJonathan.Adams@Sun.COM if (rm->rm_col[0].rc_gdata == NULL) { 32510614SJonathan.Adams@Sun.COM char *bad_parity[VDEV_RAIDZ_MAXPARITY]; 32610614SJonathan.Adams@Sun.COM char *buf; 32710614SJonathan.Adams@Sun.COM 32810614SJonathan.Adams@Sun.COM /* 32910614SJonathan.Adams@Sun.COM * Set up the rm_col[]s to generate the parity for 33010614SJonathan.Adams@Sun.COM * good_data, first saving the parity bufs and 33110614SJonathan.Adams@Sun.COM * replacing them with buffers to hold the result. 
33210614SJonathan.Adams@Sun.COM */ 33310614SJonathan.Adams@Sun.COM for (x = 0; x < rm->rm_firstdatacol; x++) { 33410614SJonathan.Adams@Sun.COM bad_parity[x] = rm->rm_col[x].rc_data; 33510614SJonathan.Adams@Sun.COM rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = 33610614SJonathan.Adams@Sun.COM zio_buf_alloc(rm->rm_col[x].rc_size); 33710614SJonathan.Adams@Sun.COM } 33810614SJonathan.Adams@Sun.COM 33910614SJonathan.Adams@Sun.COM /* fill in the data columns from good_data */ 34010614SJonathan.Adams@Sun.COM buf = (char *)good_data; 34110614SJonathan.Adams@Sun.COM for (; x < rm->rm_cols; x++) { 34210614SJonathan.Adams@Sun.COM rm->rm_col[x].rc_data = buf; 34310614SJonathan.Adams@Sun.COM buf += rm->rm_col[x].rc_size; 34410614SJonathan.Adams@Sun.COM } 34510614SJonathan.Adams@Sun.COM 34610614SJonathan.Adams@Sun.COM /* 34710614SJonathan.Adams@Sun.COM * Construct the parity from the good data. 34810614SJonathan.Adams@Sun.COM */ 34910614SJonathan.Adams@Sun.COM vdev_raidz_generate_parity(rm); 35010614SJonathan.Adams@Sun.COM 35110614SJonathan.Adams@Sun.COM /* restore everything back to its original state */ 35210614SJonathan.Adams@Sun.COM for (x = 0; x < rm->rm_firstdatacol; x++) 35310614SJonathan.Adams@Sun.COM rm->rm_col[x].rc_data = bad_parity[x]; 35410614SJonathan.Adams@Sun.COM 35510614SJonathan.Adams@Sun.COM buf = rm->rm_datacopy; 35610614SJonathan.Adams@Sun.COM for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 35710614SJonathan.Adams@Sun.COM rm->rm_col[x].rc_data = buf; 35810614SJonathan.Adams@Sun.COM buf += rm->rm_col[x].rc_size; 35910614SJonathan.Adams@Sun.COM } 36010614SJonathan.Adams@Sun.COM } 36110614SJonathan.Adams@Sun.COM 36210614SJonathan.Adams@Sun.COM ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 36310614SJonathan.Adams@Sun.COM good = rm->rm_col[c].rc_gdata; 36410614SJonathan.Adams@Sun.COM } else { 36510614SJonathan.Adams@Sun.COM /* adjust good_data to point at the start of our column */ 36610614SJonathan.Adams@Sun.COM good = good_data; 
36710614SJonathan.Adams@Sun.COM 36810614SJonathan.Adams@Sun.COM for (x = rm->rm_firstdatacol; x < c; x++) 36910614SJonathan.Adams@Sun.COM good += rm->rm_col[x].rc_size; 37010614SJonathan.Adams@Sun.COM } 37110614SJonathan.Adams@Sun.COM 37210614SJonathan.Adams@Sun.COM /* we drop the ereport if it ends up that the data was good */ 37310614SJonathan.Adams@Sun.COM zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 37410614SJonathan.Adams@Sun.COM } 37510614SJonathan.Adams@Sun.COM 37610614SJonathan.Adams@Sun.COM /* 37710614SJonathan.Adams@Sun.COM * Invoked indirectly by zfs_ereport_start_checksum(), called 37810614SJonathan.Adams@Sun.COM * below when our read operation fails completely. The main point 37910614SJonathan.Adams@Sun.COM * is to keep a copy of everything we read from disk, so that at 38010614SJonathan.Adams@Sun.COM * vdev_raidz_cksum_finish() time we can compare it with the good data. 38110614SJonathan.Adams@Sun.COM */ 38210614SJonathan.Adams@Sun.COM static void 38310614SJonathan.Adams@Sun.COM vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 38410614SJonathan.Adams@Sun.COM { 38510614SJonathan.Adams@Sun.COM size_t c = (size_t)(uintptr_t)arg; 38610614SJonathan.Adams@Sun.COM caddr_t buf; 38710614SJonathan.Adams@Sun.COM 38810614SJonathan.Adams@Sun.COM raidz_map_t *rm = zio->io_vsd; 38910614SJonathan.Adams@Sun.COM size_t size; 39010614SJonathan.Adams@Sun.COM 39110614SJonathan.Adams@Sun.COM /* set up the report and bump the refcount */ 39210614SJonathan.Adams@Sun.COM zcr->zcr_cbdata = rm; 39310614SJonathan.Adams@Sun.COM zcr->zcr_cbinfo = c; 39410614SJonathan.Adams@Sun.COM zcr->zcr_finish = vdev_raidz_cksum_finish; 39510614SJonathan.Adams@Sun.COM zcr->zcr_free = vdev_raidz_cksum_free; 39610614SJonathan.Adams@Sun.COM 39710614SJonathan.Adams@Sun.COM rm->rm_reports++; 39810614SJonathan.Adams@Sun.COM ASSERT3U(rm->rm_reports, >, 0); 39910614SJonathan.Adams@Sun.COM 40010653SJonathan.Adams@Sun.COM if (rm->rm_datacopy != NULL) 
40110614SJonathan.Adams@Sun.COM return; 40210614SJonathan.Adams@Sun.COM 40310614SJonathan.Adams@Sun.COM /* 40410653SJonathan.Adams@Sun.COM * It's the first time we're called for this raidz_map_t, so we need 40510653SJonathan.Adams@Sun.COM * to copy the data aside; there's no guarantee that our zio's buffer 40610653SJonathan.Adams@Sun.COM * won't be re-used for something else. 40710614SJonathan.Adams@Sun.COM * 40810653SJonathan.Adams@Sun.COM * Our parity data is already in separate buffers, so there's no need 40910614SJonathan.Adams@Sun.COM * to copy them. 41010614SJonathan.Adams@Sun.COM */ 41110614SJonathan.Adams@Sun.COM 41210653SJonathan.Adams@Sun.COM size = 0; 41310653SJonathan.Adams@Sun.COM for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 41410653SJonathan.Adams@Sun.COM size += rm->rm_col[c].rc_size; 41510614SJonathan.Adams@Sun.COM 41610614SJonathan.Adams@Sun.COM buf = rm->rm_datacopy = zio_buf_alloc(size); 41710653SJonathan.Adams@Sun.COM 41810653SJonathan.Adams@Sun.COM for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 41910614SJonathan.Adams@Sun.COM raidz_col_t *col = &rm->rm_col[c]; 42010614SJonathan.Adams@Sun.COM 42110614SJonathan.Adams@Sun.COM bcopy(col->rc_data, buf, col->rc_size); 42210614SJonathan.Adams@Sun.COM col->rc_data = buf; 42310614SJonathan.Adams@Sun.COM 42410614SJonathan.Adams@Sun.COM buf += col->rc_size; 42510614SJonathan.Adams@Sun.COM } 42610614SJonathan.Adams@Sun.COM ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); 42710614SJonathan.Adams@Sun.COM } 42810614SJonathan.Adams@Sun.COM 42910614SJonathan.Adams@Sun.COM static const zio_vsd_ops_t vdev_raidz_vsd_ops = { 43010614SJonathan.Adams@Sun.COM vdev_raidz_map_free_vsd, 43110614SJonathan.Adams@Sun.COM vdev_raidz_cksum_report 43210614SJonathan.Adams@Sun.COM }; 43310614SJonathan.Adams@Sun.COM 434789Sahrens static raidz_map_t * 4352082Seschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 4362082Seschrock uint64_t nparity) 437789Sahrens { 438789Sahrens raidz_map_t 
*rm; 439789Sahrens uint64_t b = zio->io_offset >> unit_shift; 440789Sahrens uint64_t s = zio->io_size >> unit_shift; 441789Sahrens uint64_t f = b % dcols; 442789Sahrens uint64_t o = (b / dcols) << unit_shift; 44310105Sadam.leventhal@sun.com uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 444789Sahrens 4452082Seschrock q = s / (dcols - nparity); 4462082Seschrock r = s - q * (dcols - nparity); 4472082Seschrock bc = (r == 0 ? 0 : r + nparity); 44810105Sadam.leventhal@sun.com tot = s + nparity * (q + (r == 0 ? 0 : 1)); 449789Sahrens 45010105Sadam.leventhal@sun.com if (q == 0) { 45110105Sadam.leventhal@sun.com acols = bc; 45210105Sadam.leventhal@sun.com scols = MIN(dcols, roundup(bc, nparity + 1)); 45310105Sadam.leventhal@sun.com } else { 45410105Sadam.leventhal@sun.com acols = dcols; 45510105Sadam.leventhal@sun.com scols = dcols; 45610105Sadam.leventhal@sun.com } 457789Sahrens 45810105Sadam.leventhal@sun.com ASSERT3U(acols, <=, scols); 45910105Sadam.leventhal@sun.com 46010105Sadam.leventhal@sun.com rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 461789Sahrens 462789Sahrens rm->rm_cols = acols; 46310105Sadam.leventhal@sun.com rm->rm_scols = scols; 464789Sahrens rm->rm_bigcols = bc; 46510450Sadam.leventhal@sun.com rm->rm_skipstart = bc; 4662082Seschrock rm->rm_missingdata = 0; 4672082Seschrock rm->rm_missingparity = 0; 4682082Seschrock rm->rm_firstdatacol = nparity; 46910614SJonathan.Adams@Sun.COM rm->rm_datacopy = NULL; 47010614SJonathan.Adams@Sun.COM rm->rm_reports = 0; 47110614SJonathan.Adams@Sun.COM rm->rm_freed = 0; 47210614SJonathan.Adams@Sun.COM rm->rm_ecksuminjected = 0; 473789Sahrens 47410105Sadam.leventhal@sun.com asize = 0; 47510105Sadam.leventhal@sun.com 47610105Sadam.leventhal@sun.com for (c = 0; c < scols; c++) { 477789Sahrens col = f + c; 478789Sahrens coff = o; 479789Sahrens if (col >= dcols) { 480789Sahrens col -= dcols; 481789Sahrens coff += 1ULL << unit_shift; 482789Sahrens } 4832082Seschrock rm->rm_col[c].rc_devidx 
= col; 484789Sahrens rm->rm_col[c].rc_offset = coff; 485789Sahrens rm->rm_col[c].rc_data = NULL; 48610614SJonathan.Adams@Sun.COM rm->rm_col[c].rc_gdata = NULL; 487789Sahrens rm->rm_col[c].rc_error = 0; 488789Sahrens rm->rm_col[c].rc_tried = 0; 489789Sahrens rm->rm_col[c].rc_skipped = 0; 49010105Sadam.leventhal@sun.com 49110105Sadam.leventhal@sun.com if (c >= acols) 49210105Sadam.leventhal@sun.com rm->rm_col[c].rc_size = 0; 49310105Sadam.leventhal@sun.com else if (c < bc) 49410105Sadam.leventhal@sun.com rm->rm_col[c].rc_size = (q + 1) << unit_shift; 49510105Sadam.leventhal@sun.com else 49610105Sadam.leventhal@sun.com rm->rm_col[c].rc_size = q << unit_shift; 49710105Sadam.leventhal@sun.com 49810105Sadam.leventhal@sun.com asize += rm->rm_col[c].rc_size; 499789Sahrens } 500789Sahrens 50110105Sadam.leventhal@sun.com ASSERT3U(asize, ==, tot << unit_shift); 50210105Sadam.leventhal@sun.com rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 50310450Sadam.leventhal@sun.com rm->rm_nskip = roundup(tot, nparity + 1) - tot; 50410450Sadam.leventhal@sun.com ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 50510450Sadam.leventhal@sun.com ASSERT3U(rm->rm_nskip, <=, nparity); 506789Sahrens 507789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++) 508789Sahrens rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 509789Sahrens 510789Sahrens rm->rm_col[c].rc_data = zio->io_data; 511789Sahrens 512789Sahrens for (c = c + 1; c < acols; c++) 513789Sahrens rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 514789Sahrens rm->rm_col[c - 1].rc_size; 515789Sahrens 5161133Seschrock /* 5172082Seschrock * If all data stored spans all columns, there's a danger that parity 5182082Seschrock * will always be on the same device and, since parity isn't read 5192082Seschrock * during normal operation, that that device's I/O bandwidth won't be 5202082Seschrock * used effectively. We therefore switch the parity every 1MB. 5212082Seschrock * 5222082Seschrock * ... 
at least that was, ostensibly, the theory. As a practical 5232082Seschrock * matter unless we juggle the parity between all devices evenly, we 5242082Seschrock * won't see any benefit. Further, occasional writes that aren't a 5252082Seschrock * multiple of the LCM of the number of children and the minimum 5262082Seschrock * stripe width are sufficient to avoid pessimal behavior. 5272082Seschrock * Unfortunately, this decision created an implicit on-disk format 5283456Sahl * requirement that we need to support for all eternity, but only 5293456Sahl * for single-parity RAID-Z. 53010450Sadam.leventhal@sun.com * 53110450Sadam.leventhal@sun.com * If we intend to skip a sector in the zeroth column for padding 53210450Sadam.leventhal@sun.com * we must make sure to note this swap. We will never intend to 53310450Sadam.leventhal@sun.com * skip the first column since at least one data and one parity 53410450Sadam.leventhal@sun.com * column must appear in each row. 5351133Seschrock */ 5361133Seschrock ASSERT(rm->rm_cols >= 2); 5371133Seschrock ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 538789Sahrens 5392082Seschrock if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 5402082Seschrock devidx = rm->rm_col[0].rc_devidx; 5411133Seschrock o = rm->rm_col[0].rc_offset; 5422082Seschrock rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 5431133Seschrock rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 5442082Seschrock rm->rm_col[1].rc_devidx = devidx; 5451133Seschrock rm->rm_col[1].rc_offset = o; 54610450Sadam.leventhal@sun.com 54710450Sadam.leventhal@sun.com if (rm->rm_skipstart == 0) 54810450Sadam.leventhal@sun.com rm->rm_skipstart = 1; 549789Sahrens } 550789Sahrens 551789Sahrens zio->io_vsd = rm; 55210614SJonathan.Adams@Sun.COM zio->io_vsd_ops = &vdev_raidz_vsd_ops; 553789Sahrens return (rm); 554789Sahrens } 555789Sahrens 556789Sahrens static void 5572082Seschrock vdev_raidz_generate_parity_p(raidz_map_t *rm) 5582082Seschrock { 5592082Seschrock 
uint64_t *p, *src, pcount, ccount, i; 5602082Seschrock int c; 5612082Seschrock 5622082Seschrock pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 5632082Seschrock 5642082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 5652082Seschrock src = rm->rm_col[c].rc_data; 5662082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 5672082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 5682082Seschrock 5692082Seschrock if (c == rm->rm_firstdatacol) { 5702082Seschrock ASSERT(ccount == pcount); 57110105Sadam.leventhal@sun.com for (i = 0; i < ccount; i++, src++, p++) { 5722082Seschrock *p = *src; 5732082Seschrock } 5742082Seschrock } else { 5752082Seschrock ASSERT(ccount <= pcount); 57610105Sadam.leventhal@sun.com for (i = 0; i < ccount; i++, src++, p++) { 5772082Seschrock *p ^= *src; 5782082Seschrock } 5792082Seschrock } 5802082Seschrock } 5812082Seschrock } 5822082Seschrock 5832082Seschrock static void 5842082Seschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm) 585789Sahrens { 58610105Sadam.leventhal@sun.com uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 5872082Seschrock int c; 5882082Seschrock 58910105Sadam.leventhal@sun.com pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 5902082Seschrock ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 5912082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 5922082Seschrock 5932082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 5942082Seschrock src = rm->rm_col[c].rc_data; 5952082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 5962082Seschrock q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 59710105Sadam.leventhal@sun.com 59810105Sadam.leventhal@sun.com ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 5992082Seschrock 6002082Seschrock if (c == rm->rm_firstdatacol) { 60110105Sadam.leventhal@sun.com ASSERT(ccnt == pcnt || ccnt == 0); 60210105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++) { 6032082Seschrock *p = *src; 60410105Sadam.leventhal@sun.com *q = *src; 6052082Seschrock } 
60610105Sadam.leventhal@sun.com for (; i < pcnt; i++, src++, p++, q++) { 60710105Sadam.leventhal@sun.com *p = 0; 6082082Seschrock *q = 0; 6092082Seschrock } 6102082Seschrock } else { 61110105Sadam.leventhal@sun.com ASSERT(ccnt <= pcnt); 612789Sahrens 6132082Seschrock /* 61410105Sadam.leventhal@sun.com * Apply the algorithm described above by multiplying 61510105Sadam.leventhal@sun.com * the previous result and adding in the new value. 6162082Seschrock */ 61710105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++) { 61810105Sadam.leventhal@sun.com *p ^= *src; 61910105Sadam.leventhal@sun.com 62010105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask); 6212082Seschrock *q ^= *src; 6222082Seschrock } 6232082Seschrock 6242082Seschrock /* 6252082Seschrock * Treat short columns as though they are full of 0s. 62610105Sadam.leventhal@sun.com * Note that there's therefore nothing needed for P. 6272082Seschrock */ 62810105Sadam.leventhal@sun.com for (; i < pcnt; i++, q++) { 62910105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask); 6302082Seschrock } 6312082Seschrock } 6322082Seschrock } 6332082Seschrock } 6342082Seschrock 6352082Seschrock static void 63610105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 63710105Sadam.leventhal@sun.com { 63810105Sadam.leventhal@sun.com uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 63910105Sadam.leventhal@sun.com int c; 64010105Sadam.leventhal@sun.com 64110105Sadam.leventhal@sun.com pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 64210105Sadam.leventhal@sun.com ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 64310105Sadam.leventhal@sun.com rm->rm_col[VDEV_RAIDZ_Q].rc_size); 64410105Sadam.leventhal@sun.com ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 64510105Sadam.leventhal@sun.com rm->rm_col[VDEV_RAIDZ_R].rc_size); 64610105Sadam.leventhal@sun.com 64710105Sadam.leventhal@sun.com for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 64810105Sadam.leventhal@sun.com src = rm->rm_col[c].rc_data; 
64910105Sadam.leventhal@sun.com p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 65010105Sadam.leventhal@sun.com q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 65110105Sadam.leventhal@sun.com r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 65210105Sadam.leventhal@sun.com 65310105Sadam.leventhal@sun.com ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 65410105Sadam.leventhal@sun.com 65510105Sadam.leventhal@sun.com if (c == rm->rm_firstdatacol) { 65610105Sadam.leventhal@sun.com ASSERT(ccnt == pcnt || ccnt == 0); 65710105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 65810105Sadam.leventhal@sun.com *p = *src; 65910105Sadam.leventhal@sun.com *q = *src; 66010105Sadam.leventhal@sun.com *r = *src; 66110105Sadam.leventhal@sun.com } 66210105Sadam.leventhal@sun.com for (; i < pcnt; i++, src++, p++, q++, r++) { 66310105Sadam.leventhal@sun.com *p = 0; 66410105Sadam.leventhal@sun.com *q = 0; 66510105Sadam.leventhal@sun.com *r = 0; 66610105Sadam.leventhal@sun.com } 66710105Sadam.leventhal@sun.com } else { 66810105Sadam.leventhal@sun.com ASSERT(ccnt <= pcnt); 66910105Sadam.leventhal@sun.com 67010105Sadam.leventhal@sun.com /* 67110105Sadam.leventhal@sun.com * Apply the algorithm described above by multiplying 67210105Sadam.leventhal@sun.com * the previous result and adding in the new value. 67310105Sadam.leventhal@sun.com */ 67410105Sadam.leventhal@sun.com for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 67510105Sadam.leventhal@sun.com *p ^= *src; 67610105Sadam.leventhal@sun.com 67710105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask); 67810105Sadam.leventhal@sun.com *q ^= *src; 67910105Sadam.leventhal@sun.com 68010105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_4(*r, mask); 68110105Sadam.leventhal@sun.com *r ^= *src; 68210105Sadam.leventhal@sun.com } 68310105Sadam.leventhal@sun.com 68410105Sadam.leventhal@sun.com /* 68510105Sadam.leventhal@sun.com * Treat short columns as though they are full of 0s. 
68610105Sadam.leventhal@sun.com * Note that there's therefore nothing needed for P. 68710105Sadam.leventhal@sun.com */ 68810105Sadam.leventhal@sun.com for (; i < pcnt; i++, q++, r++) { 68910105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*q, mask); 69010105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_4(*r, mask); 69110105Sadam.leventhal@sun.com } 69210105Sadam.leventhal@sun.com } 69310105Sadam.leventhal@sun.com } 69410105Sadam.leventhal@sun.com } 69510105Sadam.leventhal@sun.com 69610105Sadam.leventhal@sun.com /* 69710105Sadam.leventhal@sun.com * Generate RAID parity in the first virtual columns according to the number of 69810105Sadam.leventhal@sun.com * parity columns available. 69910105Sadam.leventhal@sun.com */ 70010105Sadam.leventhal@sun.com static void 70110105Sadam.leventhal@sun.com vdev_raidz_generate_parity(raidz_map_t *rm) 70210105Sadam.leventhal@sun.com { 70310105Sadam.leventhal@sun.com switch (rm->rm_firstdatacol) { 70410105Sadam.leventhal@sun.com case 1: 70510105Sadam.leventhal@sun.com vdev_raidz_generate_parity_p(rm); 70610105Sadam.leventhal@sun.com break; 70710105Sadam.leventhal@sun.com case 2: 70810105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pq(rm); 70910105Sadam.leventhal@sun.com break; 71010105Sadam.leventhal@sun.com case 3: 71110105Sadam.leventhal@sun.com vdev_raidz_generate_parity_pqr(rm); 71210105Sadam.leventhal@sun.com break; 71310105Sadam.leventhal@sun.com default: 71410105Sadam.leventhal@sun.com cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 71510105Sadam.leventhal@sun.com } 71610105Sadam.leventhal@sun.com } 71710105Sadam.leventhal@sun.com 71810105Sadam.leventhal@sun.com static int 71910105Sadam.leventhal@sun.com vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 7202082Seschrock { 7212082Seschrock uint64_t *dst, *src, xcount, ccount, count, i; 72210105Sadam.leventhal@sun.com int x = tgts[0]; 7232082Seschrock int c; 7242082Seschrock 72510105Sadam.leventhal@sun.com ASSERT(ntgts == 1); 72610105Sadam.leventhal@sun.com 
ASSERT(x >= rm->rm_firstdatacol); 72710105Sadam.leventhal@sun.com ASSERT(x < rm->rm_cols); 72810105Sadam.leventhal@sun.com 7292082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 7302082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 7312082Seschrock ASSERT(xcount > 0); 7322082Seschrock 7332082Seschrock src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 7342082Seschrock dst = rm->rm_col[x].rc_data; 7352082Seschrock for (i = 0; i < xcount; i++, dst++, src++) { 7362082Seschrock *dst = *src; 7372082Seschrock } 7382082Seschrock 7392082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 740789Sahrens src = rm->rm_col[c].rc_data; 741789Sahrens dst = rm->rm_col[x].rc_data; 7422082Seschrock 7432082Seschrock if (c == x) 7442082Seschrock continue; 7452082Seschrock 7462082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 7472082Seschrock count = MIN(ccount, xcount); 7482082Seschrock 7492082Seschrock for (i = 0; i < count; i++, dst++, src++) { 7502082Seschrock *dst ^= *src; 751789Sahrens } 752789Sahrens } 75310105Sadam.leventhal@sun.com 75410105Sadam.leventhal@sun.com return (1 << VDEV_RAIDZ_P); 755789Sahrens } 756789Sahrens 75710105Sadam.leventhal@sun.com static int 75810105Sadam.leventhal@sun.com vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 7592082Seschrock { 7602082Seschrock uint64_t *dst, *src, xcount, ccount, count, mask, i; 7612082Seschrock uint8_t *b; 76210105Sadam.leventhal@sun.com int x = tgts[0]; 7632082Seschrock int c, j, exp; 7642082Seschrock 76510105Sadam.leventhal@sun.com ASSERT(ntgts == 1); 76610105Sadam.leventhal@sun.com 7672082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 7682082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 7692082Seschrock 7702082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 7712082Seschrock src = rm->rm_col[c].rc_data; 7722082Seschrock dst = rm->rm_col[x].rc_data; 7732082Seschrock 7742082Seschrock if (c 
== x) 7752082Seschrock ccount = 0; 7762082Seschrock else 7772082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 7782082Seschrock 7792082Seschrock count = MIN(ccount, xcount); 7802082Seschrock 7812082Seschrock if (c == rm->rm_firstdatacol) { 7822082Seschrock for (i = 0; i < count; i++, dst++, src++) { 7832082Seschrock *dst = *src; 7842082Seschrock } 7852082Seschrock for (; i < xcount; i++, dst++) { 7862082Seschrock *dst = 0; 7872082Seschrock } 7882082Seschrock 7892082Seschrock } else { 7902082Seschrock for (i = 0; i < count; i++, dst++, src++) { 79110105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*dst, mask); 7922082Seschrock *dst ^= *src; 7932082Seschrock } 7942082Seschrock 7952082Seschrock for (; i < xcount; i++, dst++) { 79610105Sadam.leventhal@sun.com VDEV_RAIDZ_64MUL_2(*dst, mask); 7972082Seschrock } 7982082Seschrock } 7992082Seschrock } 8002082Seschrock 8012082Seschrock src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 8022082Seschrock dst = rm->rm_col[x].rc_data; 8032082Seschrock exp = 255 - (rm->rm_cols - 1 - x); 8042082Seschrock 8052082Seschrock for (i = 0; i < xcount; i++, dst++, src++) { 8062082Seschrock *dst ^= *src; 8072082Seschrock for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 8082082Seschrock *b = vdev_raidz_exp2(*b, exp); 8092082Seschrock } 8102082Seschrock } 81110105Sadam.leventhal@sun.com 81210105Sadam.leventhal@sun.com return (1 << VDEV_RAIDZ_Q); 8132082Seschrock } 8142082Seschrock 81510105Sadam.leventhal@sun.com static int 81610105Sadam.leventhal@sun.com vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 8172082Seschrock { 8182082Seschrock uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 8192082Seschrock void *pdata, *qdata; 8202082Seschrock uint64_t xsize, ysize, i; 82110105Sadam.leventhal@sun.com int x = tgts[0]; 82210105Sadam.leventhal@sun.com int y = tgts[1]; 8232082Seschrock 82410105Sadam.leventhal@sun.com ASSERT(ntgts == 2); 8252082Seschrock ASSERT(x < y); 8262082Seschrock ASSERT(x >= rm->rm_firstdatacol); 
8272082Seschrock ASSERT(y < rm->rm_cols); 8282082Seschrock 8292082Seschrock ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 8302082Seschrock 8312082Seschrock /* 8322082Seschrock * Move the parity data aside -- we're going to compute parity as 8332082Seschrock * though columns x and y were full of zeros -- Pxy and Qxy. We want to 8342082Seschrock * reuse the parity generation mechanism without trashing the actual 8352082Seschrock * parity so we make those columns appear to be full of zeros by 8362082Seschrock * setting their lengths to zero. 8372082Seschrock */ 8382082Seschrock pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 8392082Seschrock qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 8402082Seschrock xsize = rm->rm_col[x].rc_size; 8412082Seschrock ysize = rm->rm_col[y].rc_size; 8422082Seschrock 8432082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = 8442082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 8452082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = 8462082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 8472082Seschrock rm->rm_col[x].rc_size = 0; 8482082Seschrock rm->rm_col[y].rc_size = 0; 8492082Seschrock 8502082Seschrock vdev_raidz_generate_parity_pq(rm); 8512082Seschrock 8522082Seschrock rm->rm_col[x].rc_size = xsize; 8532082Seschrock rm->rm_col[y].rc_size = ysize; 8542082Seschrock 8552082Seschrock p = pdata; 8562082Seschrock q = qdata; 8572082Seschrock pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 8582082Seschrock qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 8592082Seschrock xd = rm->rm_col[x].rc_data; 8602082Seschrock yd = rm->rm_col[y].rc_data; 8612082Seschrock 8622082Seschrock /* 8632082Seschrock * We now have: 8642082Seschrock * Pxy = P + D_x + D_y 8652082Seschrock * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 8662082Seschrock * 8672082Seschrock * We can then solve for D_x: 8682082Seschrock * D_x = A * (P + Pxy) + B * (Q + Qxy) 8692082Seschrock * where 8702082Seschrock * A = 2^(x - y) * (2^(x - y) + 1)^-1 8712082Seschrock * B = 
2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 8722082Seschrock * 8732082Seschrock * With D_x in hand, we can easily solve for D_y: 8742082Seschrock * D_y = P + Pxy + D_x 8752082Seschrock */ 8762082Seschrock 8772082Seschrock a = vdev_raidz_pow2[255 + x - y]; 8782082Seschrock b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 8792082Seschrock tmp = 255 - vdev_raidz_log2[a ^ 1]; 8802082Seschrock 8812082Seschrock aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 8822082Seschrock bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 8832082Seschrock 8842082Seschrock for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 8852082Seschrock *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 8862082Seschrock vdev_raidz_exp2(*q ^ *qxy, bexp); 8872082Seschrock 8882082Seschrock if (i < ysize) 8892082Seschrock *yd = *p ^ *pxy ^ *xd; 8902082Seschrock } 8912082Seschrock 8922082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 8932082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_size); 8942082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 8952082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 8962082Seschrock 8972082Seschrock /* 8982082Seschrock * Restore the saved parity data. 8992082Seschrock */ 9002082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 9012082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 90210105Sadam.leventhal@sun.com 90310105Sadam.leventhal@sun.com return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 90410105Sadam.leventhal@sun.com } 90510105Sadam.leventhal@sun.com 90610105Sadam.leventhal@sun.com /* BEGIN CSTYLED */ 90710105Sadam.leventhal@sun.com /* 90810105Sadam.leventhal@sun.com * In the general case of reconstruction, we must solve the system of linear 90910105Sadam.leventhal@sun.com * equations defined by the coeffecients used to generate parity as well as 91010105Sadam.leventhal@sun.com * the contents of the data and parity disks. 
This can be expressed with 91110105Sadam.leventhal@sun.com * vectors for the original data (D) and the actual data (d) and parity (p) 91210105Sadam.leventhal@sun.com * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 91310105Sadam.leventhal@sun.com * 91410105Sadam.leventhal@sun.com * __ __ __ __ 91510105Sadam.leventhal@sun.com * | | __ __ | p_0 | 91610105Sadam.leventhal@sun.com * | V | | D_0 | | p_m-1 | 91710105Sadam.leventhal@sun.com * | | x | : | = | d_0 | 91810105Sadam.leventhal@sun.com * | I | | D_n-1 | | : | 91910105Sadam.leventhal@sun.com * | | ~~ ~~ | d_n-1 | 92010105Sadam.leventhal@sun.com * ~~ ~~ ~~ ~~ 92110105Sadam.leventhal@sun.com * 92210105Sadam.leventhal@sun.com * I is simply a square identity matrix of size n, and V is a Vandermonde 92310105Sadam.leventhal@sun.com * matrix defined by the coefficients we chose for the various parity columns 92410105Sadam.leventhal@sun.com * (1, 2, 4). Note that these values were chosen for simplicity, speedy 92510105Sadam.leventhal@sun.com * computation, and linear separability. 92610105Sadam.leventhal@sun.com * 92710105Sadam.leventhal@sun.com * __ __ __ __ 92810105Sadam.leventhal@sun.com * | 1 .. 1 1 1 | | p_0 | 92910105Sadam.leventhal@sun.com * | 2^n-1 .. 4 2 1 | __ __ | : | 93010105Sadam.leventhal@sun.com * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 93110105Sadam.leventhal@sun.com * | 1 .. 0 0 0 | | D_1 | | d_0 | 93210105Sadam.leventhal@sun.com * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 93310105Sadam.leventhal@sun.com * | : : : : | | : | | d_2 | 93410105Sadam.leventhal@sun.com * | 0 .. 1 0 0 | | D_n-1 | | : | 93510105Sadam.leventhal@sun.com * | 0 .. 0 1 0 | ~~ ~~ | : | 93610105Sadam.leventhal@sun.com * | 0 .. 0 0 1 | | d_n-1 | 93710105Sadam.leventhal@sun.com * ~~ ~~ ~~ ~~ 93810105Sadam.leventhal@sun.com * 93910105Sadam.leventhal@sun.com * Note that I, V, d, and p are known.
To compute D, we must invert the 94010105Sadam.leventhal@sun.com * matrix and use the known data and parity values to reconstruct the unknown 94110105Sadam.leventhal@sun.com * data values. We begin by removing the rows in V|I and d|p that correspond 94210105Sadam.leventhal@sun.com * to failed or missing columns; we then make V|I square (n x n) and d|p 94310105Sadam.leventhal@sun.com * sized n by removing rows corresponding to unused parity from the bottom up 94410105Sadam.leventhal@sun.com * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 94510105Sadam.leventhal@sun.com * using Gauss-Jordan elimination. In the example below we use m=3 parity 94610105Sadam.leventhal@sun.com * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 94710105Sadam.leventhal@sun.com * __ __ 94810105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 | 94910105Sadam.leventhal@sun.com * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 95010105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 | / / 95110105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 | / / 95210105Sadam.leventhal@sun.com * | 0 1 0 0 0 0 0 0 | <--' / 95310105Sadam.leventhal@sun.com * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 95410105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 | 95510105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 | 95610105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 | 95710105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 | 95810105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 | 95910105Sadam.leventhal@sun.com * ~~ ~~ 96010105Sadam.leventhal@sun.com * __ __ 96110105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 | 96210105Sadam.leventhal@sun.com * | 128 64 32 16 8 4 2 1 | 96310105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 | 96410105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 | 96510105Sadam.leventhal@sun.com * | 0 1 0 0 0 0 0 0 | 96610105Sadam.leventhal@sun.com * (V|I)' = | 0 0 1 0 0 0 0 0 | 96710105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 | 96810105Sadam.leventhal@sun.com * | 0 0 0 0 1 
0 0 0 | 96910105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 | 97010105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 | 97110105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 | 97210105Sadam.leventhal@sun.com * ~~ ~~ 97310105Sadam.leventhal@sun.com * 97410105Sadam.leventhal@sun.com * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 97510105Sadam.leventhal@sun.com * have carefully chosen the seed values 1, 2, and 4 to ensure that this 97610105Sadam.leventhal@sun.com * matrix is not singular. 97710105Sadam.leventhal@sun.com * __ __ 97810105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 97910105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 98010105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 98110105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 98210105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 98310105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 98410105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 98510105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 98610105Sadam.leventhal@sun.com * ~~ ~~ 98710105Sadam.leventhal@sun.com * __ __ 98810105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 98910105Sadam.leventhal@sun.com * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 99010105Sadam.leventhal@sun.com * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 99110105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 99210105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 99310105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 99410105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 99510105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 99610105Sadam.leventhal@sun.com * ~~ ~~ 99710105Sadam.leventhal@sun.com * __ __ 99810105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 99910105Sadam.leventhal@sun.com * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 
100010105Sadam.leventhal@sun.com * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 100110105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 100210105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 100310105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 100410105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 100510105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 100610105Sadam.leventhal@sun.com * ~~ ~~ 100710105Sadam.leventhal@sun.com * __ __ 100810105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 100910105Sadam.leventhal@sun.com * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 101010105Sadam.leventhal@sun.com * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 101110105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 101210105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 101310105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 101410105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 101510105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 101610105Sadam.leventhal@sun.com * ~~ ~~ 101710105Sadam.leventhal@sun.com * __ __ 101810105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 101910105Sadam.leventhal@sun.com * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 102010105Sadam.leventhal@sun.com * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 102110105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 102210105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 102310105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 102410105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 102510105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 102610105Sadam.leventhal@sun.com * ~~ ~~ 102710105Sadam.leventhal@sun.com * __ __ 102810105Sadam.leventhal@sun.com * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 102910105Sadam.leventhal@sun.com * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 
103010105Sadam.leventhal@sun.com * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 103110105Sadam.leventhal@sun.com * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 103210105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 103310105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 103410105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 103510105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 103610105Sadam.leventhal@sun.com * ~~ ~~ 103710105Sadam.leventhal@sun.com * __ __ 103810105Sadam.leventhal@sun.com * | 0 0 1 0 0 0 0 0 | 103910105Sadam.leventhal@sun.com * | 167 100 5 41 159 169 217 208 | 104010105Sadam.leventhal@sun.com * | 166 100 4 40 158 168 216 209 | 104110105Sadam.leventhal@sun.com * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 104210105Sadam.leventhal@sun.com * | 0 0 0 0 1 0 0 0 | 104310105Sadam.leventhal@sun.com * | 0 0 0 0 0 1 0 0 | 104410105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 1 0 | 104510105Sadam.leventhal@sun.com * | 0 0 0 0 0 0 0 1 | 104610105Sadam.leventhal@sun.com * ~~ ~~ 104710105Sadam.leventhal@sun.com * 104810105Sadam.leventhal@sun.com * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 104910105Sadam.leventhal@sun.com * of the missing data. 105010105Sadam.leventhal@sun.com * 105110105Sadam.leventhal@sun.com * As is apparent from the example above, the only non-trivial rows in the 105210105Sadam.leventhal@sun.com * inverse matrix correspond to the data disks that we're trying to 105310105Sadam.leventhal@sun.com * reconstruct. Indeed, those are the only rows we need as the others would 105410105Sadam.leventhal@sun.com * only be useful for reconstructing data known or assumed to be valid. For 105510105Sadam.leventhal@sun.com * that reason, we only build the coefficients in the rows that correspond to 105610105Sadam.leventhal@sun.com * targeted columns. 
105710105Sadam.leventhal@sun.com */ 105810105Sadam.leventhal@sun.com /* END CSTYLED */ 105910105Sadam.leventhal@sun.com 106010105Sadam.leventhal@sun.com static void 106110105Sadam.leventhal@sun.com vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 106210105Sadam.leventhal@sun.com uint8_t **rows) 106310105Sadam.leventhal@sun.com { 106410105Sadam.leventhal@sun.com int i, j; 106510105Sadam.leventhal@sun.com int pow; 106610105Sadam.leventhal@sun.com 106710105Sadam.leventhal@sun.com ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 106810105Sadam.leventhal@sun.com 106910105Sadam.leventhal@sun.com /* 107010105Sadam.leventhal@sun.com * Fill in the missing rows of interest. 107110105Sadam.leventhal@sun.com */ 107210105Sadam.leventhal@sun.com for (i = 0; i < nmap; i++) { 107310105Sadam.leventhal@sun.com ASSERT3S(0, <=, map[i]); 107410105Sadam.leventhal@sun.com ASSERT3S(map[i], <=, 2); 107510105Sadam.leventhal@sun.com 107610105Sadam.leventhal@sun.com pow = map[i] * n; 107710105Sadam.leventhal@sun.com if (pow > 255) 107810105Sadam.leventhal@sun.com pow -= 255; 107910105Sadam.leventhal@sun.com ASSERT(pow <= 255); 108010105Sadam.leventhal@sun.com 108110105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 108210105Sadam.leventhal@sun.com pow -= map[i]; 108310105Sadam.leventhal@sun.com if (pow < 0) 108410105Sadam.leventhal@sun.com pow += 255; 108510105Sadam.leventhal@sun.com rows[i][j] = vdev_raidz_pow2[pow]; 108610105Sadam.leventhal@sun.com } 108710105Sadam.leventhal@sun.com } 10882082Seschrock } 10892082Seschrock 109010105Sadam.leventhal@sun.com static void 109110105Sadam.leventhal@sun.com vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 109210105Sadam.leventhal@sun.com uint8_t **rows, uint8_t **invrows, const uint8_t *used) 109310105Sadam.leventhal@sun.com { 109410105Sadam.leventhal@sun.com int i, j, ii, jj; 109510105Sadam.leventhal@sun.com uint8_t log; 109610105Sadam.leventhal@sun.com 109710105Sadam.leventhal@sun.com /* 
109810105Sadam.leventhal@sun.com * Assert that the first nmissing entries from the array of used 109910105Sadam.leventhal@sun.com * columns correspond to parity columns and that subsequent entries 110010105Sadam.leventhal@sun.com * correspond to data columns. 110110105Sadam.leventhal@sun.com */ 110210105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 110310105Sadam.leventhal@sun.com ASSERT3S(used[i], <, rm->rm_firstdatacol); 110410105Sadam.leventhal@sun.com } 110510105Sadam.leventhal@sun.com for (; i < n; i++) { 110610105Sadam.leventhal@sun.com ASSERT3S(used[i], >=, rm->rm_firstdatacol); 110710105Sadam.leventhal@sun.com } 110810105Sadam.leventhal@sun.com 110910105Sadam.leventhal@sun.com /* 111010105Sadam.leventhal@sun.com * First initialize the storage where we'll compute the inverse rows. 111110105Sadam.leventhal@sun.com */ 111210105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 111310105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 111410105Sadam.leventhal@sun.com invrows[i][j] = (i == j) ? 1 : 0; 111510105Sadam.leventhal@sun.com } 111610105Sadam.leventhal@sun.com } 111710105Sadam.leventhal@sun.com 111810105Sadam.leventhal@sun.com /* 111910105Sadam.leventhal@sun.com * Subtract all trivial rows from the rows of consequence. 
112010105Sadam.leventhal@sun.com */ 112110105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 112210105Sadam.leventhal@sun.com for (j = nmissing; j < n; j++) { 112310105Sadam.leventhal@sun.com ASSERT3U(used[j], >=, rm->rm_firstdatacol); 112410105Sadam.leventhal@sun.com jj = used[j] - rm->rm_firstdatacol; 112510105Sadam.leventhal@sun.com ASSERT3S(jj, <, n); 112610105Sadam.leventhal@sun.com invrows[i][j] = rows[i][jj]; 112710105Sadam.leventhal@sun.com rows[i][jj] = 0; 112810105Sadam.leventhal@sun.com } 112910105Sadam.leventhal@sun.com } 113010105Sadam.leventhal@sun.com 113110105Sadam.leventhal@sun.com /* 113210105Sadam.leventhal@sun.com * For each of the rows of interest, we must normalize it and subtract 113310105Sadam.leventhal@sun.com * a multiple of it from the other rows. 113410105Sadam.leventhal@sun.com */ 113510105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 113610105Sadam.leventhal@sun.com for (j = 0; j < missing[i]; j++) { 113710105Sadam.leventhal@sun.com ASSERT3U(rows[i][j], ==, 0); 113810105Sadam.leventhal@sun.com } 113910105Sadam.leventhal@sun.com ASSERT3U(rows[i][missing[i]], !=, 0); 114010105Sadam.leventhal@sun.com 114110105Sadam.leventhal@sun.com /* 114210105Sadam.leventhal@sun.com * Compute the inverse of the first element and multiply each 114310105Sadam.leventhal@sun.com * element in the row by that value. 
114410105Sadam.leventhal@sun.com */ 114510105Sadam.leventhal@sun.com log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 114610105Sadam.leventhal@sun.com 114710105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 114810105Sadam.leventhal@sun.com rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 114910105Sadam.leventhal@sun.com invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 115010105Sadam.leventhal@sun.com } 115110105Sadam.leventhal@sun.com 115210105Sadam.leventhal@sun.com for (ii = 0; ii < nmissing; ii++) { 115310105Sadam.leventhal@sun.com if (i == ii) 115410105Sadam.leventhal@sun.com continue; 115510105Sadam.leventhal@sun.com 115610105Sadam.leventhal@sun.com ASSERT3U(rows[ii][missing[i]], !=, 0); 115710105Sadam.leventhal@sun.com 115810105Sadam.leventhal@sun.com log = vdev_raidz_log2[rows[ii][missing[i]]]; 115910105Sadam.leventhal@sun.com 116010105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 116110105Sadam.leventhal@sun.com rows[ii][j] ^= 116210105Sadam.leventhal@sun.com vdev_raidz_exp2(rows[i][j], log); 116310105Sadam.leventhal@sun.com invrows[ii][j] ^= 116410105Sadam.leventhal@sun.com vdev_raidz_exp2(invrows[i][j], log); 116510105Sadam.leventhal@sun.com } 116610105Sadam.leventhal@sun.com } 116710105Sadam.leventhal@sun.com } 116810105Sadam.leventhal@sun.com 116910105Sadam.leventhal@sun.com /* 117010105Sadam.leventhal@sun.com * Verify that the data that is left in the rows are properly part of 117110105Sadam.leventhal@sun.com * an identity matrix. 
117210105Sadam.leventhal@sun.com */ 117310105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 117410105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 117510105Sadam.leventhal@sun.com if (j == missing[i]) { 117610105Sadam.leventhal@sun.com ASSERT3U(rows[i][j], ==, 1); 117710105Sadam.leventhal@sun.com } else { 117810105Sadam.leventhal@sun.com ASSERT3U(rows[i][j], ==, 0); 117910105Sadam.leventhal@sun.com } 118010105Sadam.leventhal@sun.com } 118110105Sadam.leventhal@sun.com } 118210105Sadam.leventhal@sun.com } 118310105Sadam.leventhal@sun.com 118410105Sadam.leventhal@sun.com static void 118510105Sadam.leventhal@sun.com vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 118610105Sadam.leventhal@sun.com int *missing, uint8_t **invrows, const uint8_t *used) 118710105Sadam.leventhal@sun.com { 118810105Sadam.leventhal@sun.com int i, j, x, cc, c; 118910105Sadam.leventhal@sun.com uint8_t *src; 119010105Sadam.leventhal@sun.com uint64_t ccount; 119110105Sadam.leventhal@sun.com uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 119210105Sadam.leventhal@sun.com uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 119310105Sadam.leventhal@sun.com uint8_t log, val; 119410105Sadam.leventhal@sun.com int ll; 119510105Sadam.leventhal@sun.com uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 119610105Sadam.leventhal@sun.com uint8_t *p, *pp; 119710105Sadam.leventhal@sun.com size_t psize; 119810105Sadam.leventhal@sun.com 119910105Sadam.leventhal@sun.com psize = sizeof (invlog[0][0]) * n * nmissing; 120010105Sadam.leventhal@sun.com p = kmem_alloc(psize, KM_SLEEP); 120110105Sadam.leventhal@sun.com 120210105Sadam.leventhal@sun.com for (pp = p, i = 0; i < nmissing; i++) { 120310105Sadam.leventhal@sun.com invlog[i] = pp; 120410105Sadam.leventhal@sun.com pp += n; 120510105Sadam.leventhal@sun.com } 120610105Sadam.leventhal@sun.com 120710105Sadam.leventhal@sun.com for (i = 0; i < nmissing; i++) { 120810105Sadam.leventhal@sun.com for (j = 0; j < n; j++) { 120910105Sadam.leventhal@sun.com 
ASSERT3U(invrows[i][j], !=, 0); 121010105Sadam.leventhal@sun.com invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 121110105Sadam.leventhal@sun.com } 121210105Sadam.leventhal@sun.com } 121310105Sadam.leventhal@sun.com 121410105Sadam.leventhal@sun.com for (i = 0; i < n; i++) { 121510105Sadam.leventhal@sun.com c = used[i]; 121610105Sadam.leventhal@sun.com ASSERT3U(c, <, rm->rm_cols); 121710105Sadam.leventhal@sun.com 121810105Sadam.leventhal@sun.com src = rm->rm_col[c].rc_data; 121910105Sadam.leventhal@sun.com ccount = rm->rm_col[c].rc_size; 122010105Sadam.leventhal@sun.com for (j = 0; j < nmissing; j++) { 122110105Sadam.leventhal@sun.com cc = missing[j] + rm->rm_firstdatacol; 122210105Sadam.leventhal@sun.com ASSERT3U(cc, >=, rm->rm_firstdatacol); 122310105Sadam.leventhal@sun.com ASSERT3U(cc, <, rm->rm_cols); 122410105Sadam.leventhal@sun.com ASSERT3U(cc, !=, c); 122510105Sadam.leventhal@sun.com 122610105Sadam.leventhal@sun.com dst[j] = rm->rm_col[cc].rc_data; 122710105Sadam.leventhal@sun.com dcount[j] = rm->rm_col[cc].rc_size; 122810105Sadam.leventhal@sun.com } 122910105Sadam.leventhal@sun.com 123010105Sadam.leventhal@sun.com ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 123110105Sadam.leventhal@sun.com 123210105Sadam.leventhal@sun.com for (x = 0; x < ccount; x++, src++) { 123310105Sadam.leventhal@sun.com if (*src != 0) 123410105Sadam.leventhal@sun.com log = vdev_raidz_log2[*src]; 123510105Sadam.leventhal@sun.com 123610105Sadam.leventhal@sun.com for (cc = 0; cc < nmissing; cc++) { 123710105Sadam.leventhal@sun.com if (x >= dcount[cc]) 123810105Sadam.leventhal@sun.com continue; 123910105Sadam.leventhal@sun.com 124010105Sadam.leventhal@sun.com if (*src == 0) { 124110105Sadam.leventhal@sun.com val = 0; 124210105Sadam.leventhal@sun.com } else { 124310105Sadam.leventhal@sun.com if ((ll = log + invlog[cc][i]) >= 255) 124410105Sadam.leventhal@sun.com ll -= 255; 124510105Sadam.leventhal@sun.com val = vdev_raidz_pow2[ll]; 124610105Sadam.leventhal@sun.com } 
124710105Sadam.leventhal@sun.com 124810105Sadam.leventhal@sun.com if (i == 0) 124910105Sadam.leventhal@sun.com dst[cc][x] = val; 125010105Sadam.leventhal@sun.com else 125110105Sadam.leventhal@sun.com dst[cc][x] ^= val; 125210105Sadam.leventhal@sun.com } 125310105Sadam.leventhal@sun.com } 125410105Sadam.leventhal@sun.com } 125510105Sadam.leventhal@sun.com 125610105Sadam.leventhal@sun.com kmem_free(p, psize); 125710105Sadam.leventhal@sun.com } 125810105Sadam.leventhal@sun.com 125910105Sadam.leventhal@sun.com static int 126010105Sadam.leventhal@sun.com vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 126110105Sadam.leventhal@sun.com { 126210105Sadam.leventhal@sun.com int n, i, c, t, tt; 126310105Sadam.leventhal@sun.com int nmissing_rows; 126410105Sadam.leventhal@sun.com int missing_rows[VDEV_RAIDZ_MAXPARITY]; 126510105Sadam.leventhal@sun.com int parity_map[VDEV_RAIDZ_MAXPARITY]; 126610105Sadam.leventhal@sun.com 126710105Sadam.leventhal@sun.com uint8_t *p, *pp; 126810105Sadam.leventhal@sun.com size_t psize; 126910105Sadam.leventhal@sun.com 127010105Sadam.leventhal@sun.com uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 127110105Sadam.leventhal@sun.com uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 127210105Sadam.leventhal@sun.com uint8_t *used; 127310105Sadam.leventhal@sun.com 127410105Sadam.leventhal@sun.com int code = 0; 127510105Sadam.leventhal@sun.com 127610105Sadam.leventhal@sun.com 127710105Sadam.leventhal@sun.com n = rm->rm_cols - rm->rm_firstdatacol; 127810105Sadam.leventhal@sun.com 127910105Sadam.leventhal@sun.com /* 128010105Sadam.leventhal@sun.com * Figure out which data columns are missing. 
128110105Sadam.leventhal@sun.com */ 128210105Sadam.leventhal@sun.com nmissing_rows = 0; 128310105Sadam.leventhal@sun.com for (t = 0; t < ntgts; t++) { 128410105Sadam.leventhal@sun.com if (tgts[t] >= rm->rm_firstdatacol) { 128510105Sadam.leventhal@sun.com missing_rows[nmissing_rows++] = 128610105Sadam.leventhal@sun.com tgts[t] - rm->rm_firstdatacol; 128710105Sadam.leventhal@sun.com } 128810105Sadam.leventhal@sun.com } 128910105Sadam.leventhal@sun.com 129010105Sadam.leventhal@sun.com /* 129110105Sadam.leventhal@sun.com * Figure out which parity columns to use to help generate the missing 129210105Sadam.leventhal@sun.com * data columns. 129310105Sadam.leventhal@sun.com */ 129410105Sadam.leventhal@sun.com for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 129510105Sadam.leventhal@sun.com ASSERT(tt < ntgts); 129610105Sadam.leventhal@sun.com ASSERT(c < rm->rm_firstdatacol); 129710105Sadam.leventhal@sun.com 129810105Sadam.leventhal@sun.com /* 129910105Sadam.leventhal@sun.com * Skip any targeted parity columns. 
130010105Sadam.leventhal@sun.com */ 130110105Sadam.leventhal@sun.com if (c == tgts[tt]) { 130210105Sadam.leventhal@sun.com tt++; 130310105Sadam.leventhal@sun.com continue; 130410105Sadam.leventhal@sun.com } 130510105Sadam.leventhal@sun.com 130610105Sadam.leventhal@sun.com code |= 1 << c; 130710105Sadam.leventhal@sun.com 130810105Sadam.leventhal@sun.com parity_map[i] = c; 130910105Sadam.leventhal@sun.com i++; 131010105Sadam.leventhal@sun.com } 131110105Sadam.leventhal@sun.com 131210105Sadam.leventhal@sun.com ASSERT(code != 0); 131310105Sadam.leventhal@sun.com ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 131410105Sadam.leventhal@sun.com 131510105Sadam.leventhal@sun.com psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 131610105Sadam.leventhal@sun.com nmissing_rows * n + sizeof (used[0]) * n; 131710105Sadam.leventhal@sun.com p = kmem_alloc(psize, KM_SLEEP); 131810105Sadam.leventhal@sun.com 131910105Sadam.leventhal@sun.com for (pp = p, i = 0; i < nmissing_rows; i++) { 132010105Sadam.leventhal@sun.com rows[i] = pp; 132110105Sadam.leventhal@sun.com pp += n; 132210105Sadam.leventhal@sun.com invrows[i] = pp; 132310105Sadam.leventhal@sun.com pp += n; 132410105Sadam.leventhal@sun.com } 132510105Sadam.leventhal@sun.com used = pp; 132610105Sadam.leventhal@sun.com 132710105Sadam.leventhal@sun.com for (i = 0; i < nmissing_rows; i++) { 132810105Sadam.leventhal@sun.com used[i] = parity_map[i]; 132910105Sadam.leventhal@sun.com } 133010105Sadam.leventhal@sun.com 133110105Sadam.leventhal@sun.com for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 133210105Sadam.leventhal@sun.com if (tt < nmissing_rows && 133310105Sadam.leventhal@sun.com c == missing_rows[tt] + rm->rm_firstdatacol) { 133410105Sadam.leventhal@sun.com tt++; 133510105Sadam.leventhal@sun.com continue; 133610105Sadam.leventhal@sun.com } 133710105Sadam.leventhal@sun.com 133810105Sadam.leventhal@sun.com ASSERT3S(i, <, n); 133910105Sadam.leventhal@sun.com used[i] = c; 134010105Sadam.leventhal@sun.com i++; 
134110105Sadam.leventhal@sun.com } 134210105Sadam.leventhal@sun.com 134310105Sadam.leventhal@sun.com /* 134410105Sadam.leventhal@sun.com * Initialize the interesting rows of the matrix. 134510105Sadam.leventhal@sun.com */ 134610105Sadam.leventhal@sun.com vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 134710105Sadam.leventhal@sun.com 134810105Sadam.leventhal@sun.com /* 134910105Sadam.leventhal@sun.com * Invert the matrix. 135010105Sadam.leventhal@sun.com */ 135110105Sadam.leventhal@sun.com vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 135210105Sadam.leventhal@sun.com invrows, used); 135310105Sadam.leventhal@sun.com 135410105Sadam.leventhal@sun.com /* 135510105Sadam.leventhal@sun.com * Reconstruct the missing data using the generated matrix. 135610105Sadam.leventhal@sun.com */ 135710105Sadam.leventhal@sun.com vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 135810105Sadam.leventhal@sun.com invrows, used); 135910105Sadam.leventhal@sun.com 136010105Sadam.leventhal@sun.com kmem_free(p, psize); 136110105Sadam.leventhal@sun.com 136210105Sadam.leventhal@sun.com return (code); 136310105Sadam.leventhal@sun.com } 136410105Sadam.leventhal@sun.com 136510105Sadam.leventhal@sun.com static int 136610105Sadam.leventhal@sun.com vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 136710105Sadam.leventhal@sun.com { 136810105Sadam.leventhal@sun.com int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 136910105Sadam.leventhal@sun.com int ntgts; 137010105Sadam.leventhal@sun.com int i, c; 137110105Sadam.leventhal@sun.com int code; 137210105Sadam.leventhal@sun.com int nbadparity, nbaddata; 137310105Sadam.leventhal@sun.com int parity_valid[VDEV_RAIDZ_MAXPARITY]; 137410105Sadam.leventhal@sun.com 137510105Sadam.leventhal@sun.com /* 137610105Sadam.leventhal@sun.com * The tgts list must already be sorted. 
137710105Sadam.leventhal@sun.com */ 137810105Sadam.leventhal@sun.com for (i = 1; i < nt; i++) { 137910105Sadam.leventhal@sun.com ASSERT(t[i] > t[i - 1]); 138010105Sadam.leventhal@sun.com } 138110105Sadam.leventhal@sun.com 138210105Sadam.leventhal@sun.com nbadparity = rm->rm_firstdatacol; 138310105Sadam.leventhal@sun.com nbaddata = rm->rm_cols - nbadparity; 138410105Sadam.leventhal@sun.com ntgts = 0; 138510105Sadam.leventhal@sun.com for (i = 0, c = 0; c < rm->rm_cols; c++) { 138610105Sadam.leventhal@sun.com if (c < rm->rm_firstdatacol) 138710105Sadam.leventhal@sun.com parity_valid[c] = B_FALSE; 138810105Sadam.leventhal@sun.com 138910105Sadam.leventhal@sun.com if (i < nt && c == t[i]) { 139010105Sadam.leventhal@sun.com tgts[ntgts++] = c; 139110105Sadam.leventhal@sun.com i++; 139210105Sadam.leventhal@sun.com } else if (rm->rm_col[c].rc_error != 0) { 139310105Sadam.leventhal@sun.com tgts[ntgts++] = c; 139410105Sadam.leventhal@sun.com } else if (c >= rm->rm_firstdatacol) { 139510105Sadam.leventhal@sun.com nbaddata--; 139610105Sadam.leventhal@sun.com } else { 139710105Sadam.leventhal@sun.com parity_valid[c] = B_TRUE; 139810105Sadam.leventhal@sun.com nbadparity--; 139910105Sadam.leventhal@sun.com } 140010105Sadam.leventhal@sun.com } 140110105Sadam.leventhal@sun.com 140210105Sadam.leventhal@sun.com ASSERT(ntgts >= nt); 140310105Sadam.leventhal@sun.com ASSERT(nbaddata >= 0); 140410105Sadam.leventhal@sun.com ASSERT(nbaddata + nbadparity == ntgts); 140510105Sadam.leventhal@sun.com 140610105Sadam.leventhal@sun.com dt = &tgts[nbadparity]; 140710105Sadam.leventhal@sun.com 140810105Sadam.leventhal@sun.com /* 140910105Sadam.leventhal@sun.com * See if we can use any of our optimized reconstruction routines. 
141010105Sadam.leventhal@sun.com */ 141110105Sadam.leventhal@sun.com if (!vdev_raidz_default_to_general) { 141210105Sadam.leventhal@sun.com switch (nbaddata) { 141310105Sadam.leventhal@sun.com case 1: 141410105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_P]) 141510105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_p(rm, dt, 1)); 141610105Sadam.leventhal@sun.com 141710105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 1); 141810105Sadam.leventhal@sun.com 141910105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_Q]) 142010105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_q(rm, dt, 1)); 142110105Sadam.leventhal@sun.com 142210105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 2); 142310105Sadam.leventhal@sun.com break; 142410105Sadam.leventhal@sun.com 142510105Sadam.leventhal@sun.com case 2: 142610105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 1); 142710105Sadam.leventhal@sun.com 142810105Sadam.leventhal@sun.com if (parity_valid[VDEV_RAIDZ_P] && 142910105Sadam.leventhal@sun.com parity_valid[VDEV_RAIDZ_Q]) 143010105Sadam.leventhal@sun.com return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 143110105Sadam.leventhal@sun.com 143210105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol > 2); 143310105Sadam.leventhal@sun.com 143410105Sadam.leventhal@sun.com break; 143510105Sadam.leventhal@sun.com } 143610105Sadam.leventhal@sun.com } 143710105Sadam.leventhal@sun.com 143810105Sadam.leventhal@sun.com code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 143910105Sadam.leventhal@sun.com ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 144010105Sadam.leventhal@sun.com ASSERT(code > 0); 144110105Sadam.leventhal@sun.com return (code); 144210105Sadam.leventhal@sun.com } 14432082Seschrock 1444789Sahrens static int 1445789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 1446789Sahrens { 144710105Sadam.leventhal@sun.com vdev_t *cvd; 14482082Seschrock uint64_t nparity = vd->vdev_nparity; 144910105Sadam.leventhal@sun.com int c; 
1450789Sahrens int lasterror = 0; 1451789Sahrens int numerrors = 0; 1452789Sahrens 14532082Seschrock ASSERT(nparity > 0); 14542082Seschrock 14552082Seschrock if (nparity > VDEV_RAIDZ_MAXPARITY || 14562082Seschrock vd->vdev_children < nparity + 1) { 1457789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1458789Sahrens return (EINVAL); 1459789Sahrens } 1460789Sahrens 14619846SEric.Taylor@Sun.COM vdev_open_children(vd); 1462789Sahrens 146310105Sadam.leventhal@sun.com for (c = 0; c < vd->vdev_children; c++) { 146410105Sadam.leventhal@sun.com cvd = vd->vdev_child[c]; 14659846SEric.Taylor@Sun.COM 146610105Sadam.leventhal@sun.com if (cvd->vdev_open_error != 0) { 14679846SEric.Taylor@Sun.COM lasterror = cvd->vdev_open_error; 1468789Sahrens numerrors++; 1469789Sahrens continue; 1470789Sahrens } 1471789Sahrens 1472789Sahrens *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 14731732Sbonwick *ashift = MAX(*ashift, cvd->vdev_ashift); 1474789Sahrens } 1475789Sahrens 1476789Sahrens *asize *= vd->vdev_children; 1477789Sahrens 14782082Seschrock if (numerrors > nparity) { 1479789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1480789Sahrens return (lasterror); 1481789Sahrens } 1482789Sahrens 1483789Sahrens return (0); 1484789Sahrens } 1485789Sahrens 1486789Sahrens static void 1487789Sahrens vdev_raidz_close(vdev_t *vd) 1488789Sahrens { 148910105Sadam.leventhal@sun.com int c; 149010105Sadam.leventhal@sun.com 149110105Sadam.leventhal@sun.com for (c = 0; c < vd->vdev_children; c++) 1492789Sahrens vdev_close(vd->vdev_child[c]); 1493789Sahrens } 1494789Sahrens 1495789Sahrens static uint64_t 1496789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1497789Sahrens { 1498789Sahrens uint64_t asize; 14991732Sbonwick uint64_t ashift = vd->vdev_top->vdev_ashift; 1500789Sahrens uint64_t cols = vd->vdev_children; 15012082Seschrock uint64_t nparity = vd->vdev_nparity; 1502789Sahrens 15031732Sbonwick asize = ((psize - 1) >> ashift) + 1; 15042082Seschrock asize += nparity * ((asize + 
cols - nparity - 1) / (cols - nparity)); 15052082Seschrock asize = roundup(asize, nparity + 1) << ashift; 1506789Sahrens 1507789Sahrens return (asize); 1508789Sahrens } 1509789Sahrens 1510789Sahrens static void 1511789Sahrens vdev_raidz_child_done(zio_t *zio) 1512789Sahrens { 1513789Sahrens raidz_col_t *rc = zio->io_private; 1514789Sahrens 1515789Sahrens rc->rc_error = zio->io_error; 1516789Sahrens rc->rc_tried = 1; 1517789Sahrens rc->rc_skipped = 0; 1518789Sahrens } 1519789Sahrens 15205530Sbonwick static int 1521789Sahrens vdev_raidz_io_start(zio_t *zio) 1522789Sahrens { 1523789Sahrens vdev_t *vd = zio->io_vd; 15241732Sbonwick vdev_t *tvd = vd->vdev_top; 1525789Sahrens vdev_t *cvd; 1526789Sahrens raidz_map_t *rm; 1527789Sahrens raidz_col_t *rc; 152810105Sadam.leventhal@sun.com int c, i; 1529789Sahrens 15302082Seschrock rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 15312082Seschrock vd->vdev_nparity); 1532789Sahrens 15331775Sbillm ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1534789Sahrens 1535789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 153610105Sadam.leventhal@sun.com vdev_raidz_generate_parity(rm); 1537789Sahrens 1538789Sahrens for (c = 0; c < rm->rm_cols; c++) { 1539789Sahrens rc = &rm->rm_col[c]; 15402082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 1541789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1542789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 15437754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0, 1544789Sahrens vdev_raidz_child_done, rc)); 1545789Sahrens } 15465530Sbonwick 154710105Sadam.leventhal@sun.com /* 154810105Sadam.leventhal@sun.com * Generate optional I/Os for any skipped sectors to improve 154910105Sadam.leventhal@sun.com * aggregation contiguity. 
155010105Sadam.leventhal@sun.com */ 155110450Sadam.leventhal@sun.com for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 155210105Sadam.leventhal@sun.com ASSERT(c <= rm->rm_scols); 155310105Sadam.leventhal@sun.com if (c == rm->rm_scols) 155410105Sadam.leventhal@sun.com c = 0; 155510105Sadam.leventhal@sun.com rc = &rm->rm_col[c]; 155610105Sadam.leventhal@sun.com cvd = vd->vdev_child[rc->rc_devidx]; 155710105Sadam.leventhal@sun.com zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 155810105Sadam.leventhal@sun.com rc->rc_offset + rc->rc_size, NULL, 155910105Sadam.leventhal@sun.com 1 << tvd->vdev_ashift, 156010105Sadam.leventhal@sun.com zio->io_type, zio->io_priority, 156110105Sadam.leventhal@sun.com ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 156210105Sadam.leventhal@sun.com } 156310105Sadam.leventhal@sun.com 15647754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 1565789Sahrens } 1566789Sahrens 1567789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 1568789Sahrens 15692082Seschrock /* 15702082Seschrock * Iterate over the columns in reverse order so that we hit the parity 157110105Sadam.leventhal@sun.com * last -- any errors along the way will force us to read the parity. 
15722082Seschrock */ 1573789Sahrens for (c = rm->rm_cols - 1; c >= 0; c--) { 1574789Sahrens rc = &rm->rm_col[c]; 15752082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 15765329Sgw25295 if (!vdev_readable(cvd)) { 15772082Seschrock if (c >= rm->rm_firstdatacol) 15782082Seschrock rm->rm_missingdata++; 15792082Seschrock else 15802082Seschrock rm->rm_missingparity++; 1581789Sahrens rc->rc_error = ENXIO; 1582789Sahrens rc->rc_tried = 1; /* don't even try */ 1583789Sahrens rc->rc_skipped = 1; 1584789Sahrens continue; 1585789Sahrens } 158610922SJeff.Bonwick@Sun.COM if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 15872082Seschrock if (c >= rm->rm_firstdatacol) 15882082Seschrock rm->rm_missingdata++; 15892082Seschrock else 15902082Seschrock rm->rm_missingparity++; 1591789Sahrens rc->rc_error = ESTALE; 1592789Sahrens rc->rc_skipped = 1; 1593789Sahrens continue; 1594789Sahrens } 15952082Seschrock if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 15969434SMark.Musante@Sun.COM (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1597789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1598789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 15997754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0, 1600789Sahrens vdev_raidz_child_done, rc)); 1601789Sahrens } 1602789Sahrens } 1603789Sahrens 16047754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 1605789Sahrens } 1606789Sahrens 16071544Seschrock /* 16081544Seschrock * Report a checksum error for a child of a RAID-Z device. 
16091544Seschrock */ 16101544Seschrock static void 161110614SJonathan.Adams@Sun.COM raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) 16121544Seschrock { 16132082Seschrock vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 16141544Seschrock 16151544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 161610614SJonathan.Adams@Sun.COM zio_bad_cksum_t zbc; 161710614SJonathan.Adams@Sun.COM raidz_map_t *rm = zio->io_vsd; 161810614SJonathan.Adams@Sun.COM 16191544Seschrock mutex_enter(&vd->vdev_stat_lock); 16201544Seschrock vd->vdev_stat.vs_checksum_errors++; 16211544Seschrock mutex_exit(&vd->vdev_stat_lock); 162210614SJonathan.Adams@Sun.COM 162310614SJonathan.Adams@Sun.COM zbc.zbc_has_cksum = 0; 162410614SJonathan.Adams@Sun.COM zbc.zbc_injected = rm->rm_ecksuminjected; 162510614SJonathan.Adams@Sun.COM 162610614SJonathan.Adams@Sun.COM zfs_ereport_post_checksum(zio->io_spa, vd, zio, 162710614SJonathan.Adams@Sun.COM rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, 162810614SJonathan.Adams@Sun.COM &zbc); 16291544Seschrock } 163010614SJonathan.Adams@Sun.COM } 16311544Seschrock 163210614SJonathan.Adams@Sun.COM /* 163310614SJonathan.Adams@Sun.COM * We keep track of whether or not there were any injected errors, so that 163410614SJonathan.Adams@Sun.COM * any ereports we generate can note it. 
163510614SJonathan.Adams@Sun.COM */ 163610614SJonathan.Adams@Sun.COM static int 163710614SJonathan.Adams@Sun.COM raidz_checksum_verify(zio_t *zio) 163810614SJonathan.Adams@Sun.COM { 163910614SJonathan.Adams@Sun.COM zio_bad_cksum_t zbc; 164010614SJonathan.Adams@Sun.COM raidz_map_t *rm = zio->io_vsd; 164110614SJonathan.Adams@Sun.COM 164210614SJonathan.Adams@Sun.COM int ret = zio_checksum_error(zio, &zbc); 164310614SJonathan.Adams@Sun.COM if (ret != 0 && zbc.zbc_injected != 0) 164410614SJonathan.Adams@Sun.COM rm->rm_ecksuminjected = 1; 164510614SJonathan.Adams@Sun.COM 164610614SJonathan.Adams@Sun.COM return (ret); 16471544Seschrock } 16481544Seschrock 16492082Seschrock /* 16502082Seschrock * Generate the parity from the data columns. If we tried and were able to 16512082Seschrock * read the parity without error, verify that the generated parity matches the 16522082Seschrock * data we read. If it doesn't, we fire off a checksum error. Return the 16532082Seschrock * number such failures. 16542082Seschrock */ 16552082Seschrock static int 16562082Seschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 16572082Seschrock { 16582082Seschrock void *orig[VDEV_RAIDZ_MAXPARITY]; 16592082Seschrock int c, ret = 0; 16602082Seschrock raidz_col_t *rc; 16612082Seschrock 16622082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 16632082Seschrock rc = &rm->rm_col[c]; 16642082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 16652082Seschrock continue; 16662082Seschrock orig[c] = zio_buf_alloc(rc->rc_size); 16672082Seschrock bcopy(rc->rc_data, orig[c], rc->rc_size); 16682082Seschrock } 16692082Seschrock 167010105Sadam.leventhal@sun.com vdev_raidz_generate_parity(rm); 16712082Seschrock 16722082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 16732082Seschrock rc = &rm->rm_col[c]; 16742082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 16752082Seschrock continue; 16762082Seschrock if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 167710614SJonathan.Adams@Sun.COM 
raidz_checksum_error(zio, rc, orig[c]); 16782082Seschrock rc->rc_error = ECKSUM; 16792082Seschrock ret++; 16802082Seschrock } 16812082Seschrock zio_buf_free(orig[c], rc->rc_size); 16822082Seschrock } 16832082Seschrock 16842082Seschrock return (ret); 16852082Seschrock } 16862082Seschrock 168710105Sadam.leventhal@sun.com /* 168810105Sadam.leventhal@sun.com * Keep statistics on all the ways that we used parity to correct data. 168910105Sadam.leventhal@sun.com */ 169010105Sadam.leventhal@sun.com static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 16911544Seschrock 16925530Sbonwick static int 16937754SJeff.Bonwick@Sun.COM vdev_raidz_worst_error(raidz_map_t *rm) 16947754SJeff.Bonwick@Sun.COM { 16957754SJeff.Bonwick@Sun.COM int error = 0; 16967754SJeff.Bonwick@Sun.COM 16977754SJeff.Bonwick@Sun.COM for (int c = 0; c < rm->rm_cols; c++) 16987754SJeff.Bonwick@Sun.COM error = zio_worst_error(error, rm->rm_col[c].rc_error); 16997754SJeff.Bonwick@Sun.COM 17007754SJeff.Bonwick@Sun.COM return (error); 17017754SJeff.Bonwick@Sun.COM } 17027754SJeff.Bonwick@Sun.COM 170310105Sadam.leventhal@sun.com /* 170410105Sadam.leventhal@sun.com * Iterate over all combinations of bad data and attempt a reconstruction. 170510105Sadam.leventhal@sun.com * Note that the algorithm below is non-optimal because it doesn't take into 170610105Sadam.leventhal@sun.com * account how reconstruction is actually performed. For example, with 170710105Sadam.leventhal@sun.com * triple-parity RAID-Z the reconstruction procedure is the same if column 4 170810105Sadam.leventhal@sun.com * is targeted as invalid as if columns 1 and 4 are targeted since in both 170910105Sadam.leventhal@sun.com * cases we'd only use parity information in column 0. 
*/
/*
 * total_errors and data_errors are the error counts the caller has already
 * observed; total_errors must be smaller than the number of parity columns.
 * Target sets of size n = 1 .. (rm_firstdatacol - total_errors) are tried in
 * turn.  Returns the code from vdev_raidz_reconstruct() for the first target
 * set whose reconstruction passes raidz_checksum_verify(), or 0 if none does.
 * On success every targeted column is marked ECKSUM.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/*
	 * tgts[] is offset by one inside tstore[] so that tgts[-1] and
	 * tgts[n] are valid sentinel slots on either side of the targets.
	 */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/*
		 * One more save buffer per pass; each is sized like column 0.
		 */
		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * Every targeted column held bad data: report
				 * those that were actually read and mark all
				 * of them ECKSUM.
				 */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/*
	 * The loop exited with n one past the last pass that allocated a
	 * buffer; step back so the cleanup below frees exactly those buffers.
	 */
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 18852082Seschrock 18862082Seschrock if (c < rm->rm_firstdatacol) 18872082Seschrock parity_errors++; 18882082Seschrock else 18892082Seschrock data_errors++; 18902082Seschrock 1891789Sahrens if (!rc->rc_skipped) 1892789Sahrens unexpected_errors++; 18932082Seschrock 18947754SJeff.Bonwick@Sun.COM total_errors++; 18953456Sahl } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 18963456Sahl parity_untried++; 1897789Sahrens } 1898789Sahrens } 1899789Sahrens 1900789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 1901789Sahrens /* 19027754SJeff.Bonwick@Sun.COM * XXX -- for now, treat partial writes as a success. 19037754SJeff.Bonwick@Sun.COM * (If we couldn't write enough columns to reconstruct 19047754SJeff.Bonwick@Sun.COM * the data, the I/O failed. Otherwise, good enough.) 19057754SJeff.Bonwick@Sun.COM * 19067754SJeff.Bonwick@Sun.COM * Now that we support write reallocation, it would be better 19077754SJeff.Bonwick@Sun.COM * to treat partial failure as real failure unless there are 19087754SJeff.Bonwick@Sun.COM * no non-degraded top-level vdevs left, and not update DTLs 19097754SJeff.Bonwick@Sun.COM * if we intend to reallocate. 1910789Sahrens */ 1911789Sahrens /* XXPOLICY */ 19127754SJeff.Bonwick@Sun.COM if (total_errors > rm->rm_firstdatacol) 19137754SJeff.Bonwick@Sun.COM zio->io_error = vdev_raidz_worst_error(rm); 1914789Sahrens 19157754SJeff.Bonwick@Sun.COM return; 1916789Sahrens } 1917789Sahrens 1918789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 19192082Seschrock /* 19202082Seschrock * There are three potential phases for a read: 19212082Seschrock * 1. produce valid data from the columns read 19222082Seschrock * 2. read all disks and try again 19232082Seschrock * 3. perform combinatorial reconstruction 19242082Seschrock * 19252082Seschrock * Each phase is progressively both more expensive and less likely to 19262082Seschrock * occur. 
If we encounter more errors than we can repair or all phases 19272082Seschrock * fail, we have no choice but to return an error. 19282082Seschrock */ 1929789Sahrens 1930789Sahrens /* 19312082Seschrock * If the number of errors we saw was correctable -- less than or equal 19323456Sahl * to the number of parity disks read -- attempt to produce data that 19333456Sahl * has a valid checksum. Naturally, this case applies in the absence of 19343456Sahl * any errors. 1935789Sahrens */ 19367754SJeff.Bonwick@Sun.COM if (total_errors <= rm->rm_firstdatacol - parity_untried) { 193710105Sadam.leventhal@sun.com if (data_errors == 0) { 193810614SJonathan.Adams@Sun.COM if (raidz_checksum_verify(zio) == 0) { 19394034Sahl /* 19404034Sahl * If we read parity information (unnecessarily 19414034Sahl * as it happens since no reconstruction was 19424034Sahl * needed) regenerate and verify the parity. 19434034Sahl * We also regenerate parity when resilvering 19444034Sahl * so we can write it out to the failed device 19454034Sahl * later. 19464034Sahl */ 19473456Sahl if (parity_errors + parity_untried < 19484034Sahl rm->rm_firstdatacol || 19494034Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) { 19503456Sahl n = raidz_parity_verify(zio, rm); 19513456Sahl unexpected_errors += n; 19523456Sahl ASSERT(parity_errors + n <= 19533456Sahl rm->rm_firstdatacol); 19543456Sahl } 19552082Seschrock goto done; 19562082Seschrock } 195710105Sadam.leventhal@sun.com } else { 19583456Sahl /* 19593456Sahl * We either attempt to read all the parity columns or 19603456Sahl * none of them. If we didn't try to read parity, we 19613456Sahl * wouldn't be here in the correctable case. There must 19623456Sahl * also have been fewer parity errors than parity 19633456Sahl * columns or, again, we wouldn't be in this code path. 
19643456Sahl */ 19653456Sahl ASSERT(parity_untried == 0); 19662082Seschrock ASSERT(parity_errors < rm->rm_firstdatacol); 19672082Seschrock 19682082Seschrock /* 196910105Sadam.leventhal@sun.com * Identify the data columns that reported an error. 19702082Seschrock */ 197110105Sadam.leventhal@sun.com n = 0; 19722082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 19732082Seschrock rc = &rm->rm_col[c]; 197410105Sadam.leventhal@sun.com if (rc->rc_error != 0) { 197510105Sadam.leventhal@sun.com ASSERT(n < VDEV_RAIDZ_MAXPARITY); 197610105Sadam.leventhal@sun.com tgts[n++] = c; 197710105Sadam.leventhal@sun.com } 19782082Seschrock } 19792082Seschrock 198010105Sadam.leventhal@sun.com ASSERT(rm->rm_firstdatacol >= n); 198110105Sadam.leventhal@sun.com 198210105Sadam.leventhal@sun.com code = vdev_raidz_reconstruct(rm, tgts, n); 19832082Seschrock 198410614SJonathan.Adams@Sun.COM if (raidz_checksum_verify(zio) == 0) { 198510105Sadam.leventhal@sun.com atomic_inc_64(&raidz_corrected[code]); 1986789Sahrens 19872082Seschrock /* 198810105Sadam.leventhal@sun.com * If we read more parity disks than were used 198910105Sadam.leventhal@sun.com * for reconstruction, confirm that the other 199010105Sadam.leventhal@sun.com * parity disks produced correct data. This 199110105Sadam.leventhal@sun.com * routine is suboptimal in that it regenerates 199210105Sadam.leventhal@sun.com * the parity that we already used in addition 199310105Sadam.leventhal@sun.com * to the parity that we're attempting to 199410105Sadam.leventhal@sun.com * verify, but this should be a relatively 199510105Sadam.leventhal@sun.com * uncommon case, and can be optimized if it 199610105Sadam.leventhal@sun.com * becomes a problem. Note that we regenerate 199710105Sadam.leventhal@sun.com * parity when resilvering so we can write it 199810105Sadam.leventhal@sun.com * out to failed devices later. 
19992082Seschrock */ 200010105Sadam.leventhal@sun.com if (parity_errors < rm->rm_firstdatacol - n || 20014034Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) { 20022082Seschrock n = raidz_parity_verify(zio, rm); 20032082Seschrock unexpected_errors += n; 20042082Seschrock ASSERT(parity_errors + n <= 20052082Seschrock rm->rm_firstdatacol); 20062082Seschrock } 20072082Seschrock 20082082Seschrock goto done; 20092082Seschrock } 2010789Sahrens } 2011789Sahrens } 2012789Sahrens 2013789Sahrens /* 20142082Seschrock * This isn't a typical situation -- either we got a read error or 20152082Seschrock * a child silently returned bad data. Read every block so we can 20162082Seschrock * try again with as much data and parity as we can track down. If 20172082Seschrock * we've already been through once before, all children will be marked 20182082Seschrock * as tried so we'll proceed to combinatorial reconstruction. 2019789Sahrens */ 2020789Sahrens unexpected_errors = 1; 20212082Seschrock rm->rm_missingdata = 0; 20222082Seschrock rm->rm_missingparity = 0; 2023789Sahrens 20242082Seschrock for (c = 0; c < rm->rm_cols; c++) { 20252082Seschrock if (rm->rm_col[c].rc_tried) 20262082Seschrock continue; 2027789Sahrens 2028789Sahrens zio_vdev_io_redone(zio); 20292082Seschrock do { 2030789Sahrens rc = &rm->rm_col[c]; 2031789Sahrens if (rc->rc_tried) 2032789Sahrens continue; 2033789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, 20342082Seschrock vd->vdev_child[rc->rc_devidx], 2035789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 20367754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0, 2037789Sahrens vdev_raidz_child_done, rc)); 20382082Seschrock } while (++c < rm->rm_cols); 20395530Sbonwick 20407754SJeff.Bonwick@Sun.COM return; 2041789Sahrens } 2042789Sahrens 2043789Sahrens /* 20442082Seschrock * At this point we've attempted to reconstruct the data given the 20452082Seschrock * errors we detected, and we've attempted to read all columns. 
There 20462082Seschrock * must, therefore, be one or more additional problems -- silent errors 20472082Seschrock * resulting in invalid data rather than explicit I/O errors resulting 204810105Sadam.leventhal@sun.com * in absent data. We check if there is enough additional data to 204910105Sadam.leventhal@sun.com * possibly reconstruct the data and then perform combinatorial 205010105Sadam.leventhal@sun.com * reconstruction over all possible combinations. If that fails, 205110105Sadam.leventhal@sun.com * we're cooked. 2052789Sahrens */ 205310614SJonathan.Adams@Sun.COM if (total_errors > rm->rm_firstdatacol) { 20547754SJeff.Bonwick@Sun.COM zio->io_error = vdev_raidz_worst_error(rm); 20552082Seschrock 205610614SJonathan.Adams@Sun.COM } else if (total_errors < rm->rm_firstdatacol && 205710614SJonathan.Adams@Sun.COM (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { 20582082Seschrock /* 205910105Sadam.leventhal@sun.com * If we didn't use all the available parity for the 206010105Sadam.leventhal@sun.com * combinatorial reconstruction, verify that the remaining 206110105Sadam.leventhal@sun.com * parity is correct. 20622082Seschrock */ 206310105Sadam.leventhal@sun.com if (code != (1 << rm->rm_firstdatacol) - 1) 206410105Sadam.leventhal@sun.com (void) raidz_parity_verify(zio, rm); 206510105Sadam.leventhal@sun.com } else { 206610105Sadam.leventhal@sun.com /* 206710614SJonathan.Adams@Sun.COM * We're here because either: 206810614SJonathan.Adams@Sun.COM * 206910614SJonathan.Adams@Sun.COM * total_errors == rm_first_datacol, or 207010614SJonathan.Adams@Sun.COM * vdev_raidz_combrec() failed 207110614SJonathan.Adams@Sun.COM * 207210614SJonathan.Adams@Sun.COM * In either case, there is enough bad data to prevent 207310614SJonathan.Adams@Sun.COM * reconstruction. 207410614SJonathan.Adams@Sun.COM * 207510614SJonathan.Adams@Sun.COM * Start checksum ereports for all children which haven't 207611670SNeil.Perrin@Sun.COM * failed, and the IO wasn't speculative. 
207710105Sadam.leventhal@sun.com */ 207810105Sadam.leventhal@sun.com zio->io_error = ECKSUM; 20792082Seschrock 208011670SNeil.Perrin@Sun.COM if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 208111670SNeil.Perrin@Sun.COM for (c = 0; c < rm->rm_cols; c++) { 208211670SNeil.Perrin@Sun.COM rc = &rm->rm_col[c]; 208311670SNeil.Perrin@Sun.COM if (rc->rc_error == 0) { 208411670SNeil.Perrin@Sun.COM zio_bad_cksum_t zbc; 208511670SNeil.Perrin@Sun.COM zbc.zbc_has_cksum = 0; 208611670SNeil.Perrin@Sun.COM zbc.zbc_injected = 208711670SNeil.Perrin@Sun.COM rm->rm_ecksuminjected; 208810614SJonathan.Adams@Sun.COM 208911670SNeil.Perrin@Sun.COM zfs_ereport_start_checksum( 209011670SNeil.Perrin@Sun.COM zio->io_spa, 209111670SNeil.Perrin@Sun.COM vd->vdev_child[rc->rc_devidx], 209211670SNeil.Perrin@Sun.COM zio, rc->rc_offset, rc->rc_size, 209311670SNeil.Perrin@Sun.COM (void *)(uintptr_t)c, &zbc); 209411670SNeil.Perrin@Sun.COM } 20952082Seschrock } 20961544Seschrock } 20971544Seschrock } 2098789Sahrens 2099789Sahrens done: 2100789Sahrens zio_checksum_verified(zio); 2101789Sahrens 21028241SJeff.Bonwick@Sun.COM if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2103789Sahrens (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2104789Sahrens /* 2105789Sahrens * Use the good data we have in hand to repair damaged children. 2106789Sahrens */ 2107789Sahrens for (c = 0; c < rm->rm_cols; c++) { 2108789Sahrens rc = &rm->rm_col[c]; 21092082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 2110789Sahrens 21111732Sbonwick if (rc->rc_error == 0) 21121732Sbonwick continue; 21131732Sbonwick 21147754SJeff.Bonwick@Sun.COM zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 21151732Sbonwick rc->rc_offset, rc->rc_data, rc->rc_size, 21161732Sbonwick ZIO_TYPE_WRITE, zio->io_priority, 21178241SJeff.Bonwick@Sun.COM ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
21188241SJeff.Bonwick@Sun.COM ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 21191732Sbonwick } 2120789Sahrens } 2121789Sahrens } 2122789Sahrens 2123789Sahrens static void 2124789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2125789Sahrens { 21262082Seschrock if (faulted > vd->vdev_nparity) 21271544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 21281544Seschrock VDEV_AUX_NO_REPLICAS); 2129789Sahrens else if (degraded + faulted != 0) 21301544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2131789Sahrens else 21321544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2133789Sahrens } 2134789Sahrens 2135789Sahrens vdev_ops_t vdev_raidz_ops = { 2136789Sahrens vdev_raidz_open, 2137789Sahrens vdev_raidz_close, 2138789Sahrens vdev_raidz_asize, 2139789Sahrens vdev_raidz_io_start, 2140789Sahrens vdev_raidz_io_done, 2141789Sahrens vdev_raidz_state_change, 2142*11958SGeorge.Wilson@Sun.COM NULL, 2143*11958SGeorge.Wilson@Sun.COM NULL, 2144789Sahrens VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2145789Sahrens B_FALSE /* not a leaf vdev */ 2146789Sahrens }; 2147