1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 212082Seschrock 22789Sahrens /* 239434SMark.Musante@Sun.COM * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24789Sahrens * Use is subject to license terms. 25789Sahrens */ 26789Sahrens 27789Sahrens #include <sys/zfs_context.h> 28789Sahrens #include <sys/spa.h> 29789Sahrens #include <sys/vdev_impl.h> 30789Sahrens #include <sys/zio.h> 31789Sahrens #include <sys/zio_checksum.h> 32789Sahrens #include <sys/fs/zfs.h> 331544Seschrock #include <sys/fm/fs/zfs.h> 34789Sahrens 35789Sahrens /* 36789Sahrens * Virtual device vector for RAID-Z. 372082Seschrock * 382082Seschrock * This vdev supports both single and double parity. For single parity, we 392082Seschrock * use a simple XOR of all the data columns. For double parity, we use both 402082Seschrock * the simple XOR as well as a technique described in "The mathematics of 412082Seschrock * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8), 422082Seschrock * over the integers expressable in a single byte. Briefly, the operations on 432082Seschrock * the field are defined as follows: 442082Seschrock * 452082Seschrock * o addition (+) is represented by a bitwise XOR 462082Seschrock * o subtraction (-) is therefore identical to addition: A + B = A - B 472082Seschrock * o multiplication of A by 2 is defined by the following bitwise expression: 482082Seschrock * (A * 2)_7 = A_6 492082Seschrock * (A * 2)_6 = A_5 502082Seschrock * (A * 2)_5 = A_4 512082Seschrock * (A * 2)_4 = A_3 + A_7 522082Seschrock * (A * 2)_3 = A_2 + A_7 532082Seschrock * (A * 2)_2 = A_1 + A_7 542082Seschrock * (A * 2)_1 = A_0 552082Seschrock * (A * 2)_0 = A_7 562082Seschrock * 572082Seschrock * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 582082Seschrock * 592082Seschrock * Observe that any number in the field (except for 0) can be expressed as a 602082Seschrock * power of 2 -- a generator for the field. We store a table of the powers of 612082Seschrock * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 622082Seschrock * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 632082Seschrock * than field addition). The inverse of a field element A (A^-1) is A^254. 642082Seschrock * 652082Seschrock * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1, 662082Seschrock * can be expressed by field operations: 672082Seschrock * 682082Seschrock * P = D_0 + D_1 + ... + D_n-2 + D_n-1 692082Seschrock * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 702082Seschrock * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 712082Seschrock * 722082Seschrock * See the reconstruction code below for how P and Q can used individually or 732082Seschrock * in concert to recover missing data columns. 74789Sahrens */ 75789Sahrens 76789Sahrens typedef struct raidz_col { 772082Seschrock uint64_t rc_devidx; /* child device index for I/O */ 782082Seschrock uint64_t rc_offset; /* device offset */ 792082Seschrock uint64_t rc_size; /* I/O size */ 802082Seschrock void *rc_data; /* I/O data */ 812082Seschrock int rc_error; /* I/O error for this device */ 822082Seschrock uint8_t rc_tried; /* Did we attempt this I/O column? */ 832082Seschrock uint8_t rc_skipped; /* Did we skip this I/O column? */ 84789Sahrens } raidz_col_t; 85789Sahrens 86789Sahrens typedef struct raidz_map { 872082Seschrock uint64_t rm_cols; /* Column count */ 882082Seschrock uint64_t rm_bigcols; /* Number of oversized columns */ 892082Seschrock uint64_t rm_asize; /* Actual total I/O size */ 902082Seschrock uint64_t rm_missingdata; /* Count of missing data devices */ 912082Seschrock uint64_t rm_missingparity; /* Count of missing parity devices */ 922082Seschrock uint64_t rm_firstdatacol; /* First data column/parity count */ 932082Seschrock raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 94789Sahrens } raidz_map_t; 95789Sahrens 962082Seschrock #define VDEV_RAIDZ_P 0 972082Seschrock #define VDEV_RAIDZ_Q 1 982082Seschrock 992082Seschrock #define VDEV_RAIDZ_MAXPARITY 2 1002082Seschrock 1012082Seschrock #define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) 1022082Seschrock 1032082Seschrock /* 1042082Seschrock * These two tables represent powers and logs of 2 in the Galois field defined 1052082Seschrock * above. These values were computed by repeatedly multiplying by 2 as above. 1062082Seschrock */ 1072082Seschrock static const uint8_t vdev_raidz_pow2[256] = { 1082082Seschrock 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 1092082Seschrock 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 1102082Seschrock 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 1112082Seschrock 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 1122082Seschrock 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 1132082Seschrock 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 1142082Seschrock 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 1152082Seschrock 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 1162082Seschrock 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 1172082Seschrock 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 1182082Seschrock 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 1192082Seschrock 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 1202082Seschrock 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 1212082Seschrock 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 1222082Seschrock 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 1232082Seschrock 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 1242082Seschrock 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 1252082Seschrock 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 1262082Seschrock 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 1272082Seschrock 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 1282082Seschrock 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 1292082Seschrock 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 1302082Seschrock 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 1312082Seschrock 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 1322082Seschrock 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 1332082Seschrock 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 1342082Seschrock 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 1352082Seschrock 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 1362082Seschrock 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 1372082Seschrock 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 1382082Seschrock 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 1392082Seschrock 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 1402082Seschrock }; 1412082Seschrock static const uint8_t vdev_raidz_log2[256] = { 1422082Seschrock 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 1432082Seschrock 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 1442082Seschrock 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 1452082Seschrock 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 1462082Seschrock 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 1472082Seschrock 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 1482082Seschrock 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 1492082Seschrock 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 1502082Seschrock 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 1512082Seschrock 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 1522082Seschrock 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 1532082Seschrock 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 1542082Seschrock 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 1552082Seschrock 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 1562082Seschrock 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 1572082Seschrock 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 1582082Seschrock 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 1592082Seschrock 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 1602082Seschrock 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 1612082Seschrock 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 1622082Seschrock 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 1632082Seschrock 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 1642082Seschrock 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 1652082Seschrock 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 1662082Seschrock 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 1672082Seschrock 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 1682082Seschrock 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 1692082Seschrock 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 1702082Seschrock 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 1712082Seschrock 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 1722082Seschrock 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 1732082Seschrock 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 1742082Seschrock }; 1752082Seschrock 1762082Seschrock /* 1772082Seschrock * Multiply a given number by 2 raised to the given power. 1782082Seschrock */ 1792082Seschrock static uint8_t 1802082Seschrock vdev_raidz_exp2(uint_t a, int exp) 1812082Seschrock { 1822082Seschrock if (a == 0) 1832082Seschrock return (0); 1842082Seschrock 1852082Seschrock ASSERT(exp >= 0); 1862082Seschrock ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 1872082Seschrock 1882082Seschrock exp += vdev_raidz_log2[a]; 1892082Seschrock if (exp > 255) 1902082Seschrock exp -= 255; 1912082Seschrock 1922082Seschrock return (vdev_raidz_pow2[exp]); 1932082Seschrock } 1942082Seschrock 1957754SJeff.Bonwick@Sun.COM static void 1967754SJeff.Bonwick@Sun.COM vdev_raidz_map_free(zio_t *zio) 1977754SJeff.Bonwick@Sun.COM { 1987754SJeff.Bonwick@Sun.COM raidz_map_t *rm = zio->io_vsd; 1997754SJeff.Bonwick@Sun.COM int c; 2007754SJeff.Bonwick@Sun.COM 2017754SJeff.Bonwick@Sun.COM for (c = 0; c < rm->rm_firstdatacol; c++) 2027754SJeff.Bonwick@Sun.COM zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 2037754SJeff.Bonwick@Sun.COM 2047754SJeff.Bonwick@Sun.COM kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); 2057754SJeff.Bonwick@Sun.COM } 2067754SJeff.Bonwick@Sun.COM 207789Sahrens static raidz_map_t * 2082082Seschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 2092082Seschrock uint64_t nparity) 210789Sahrens { 211789Sahrens raidz_map_t *rm; 212789Sahrens uint64_t b = zio->io_offset >> unit_shift; 213789Sahrens uint64_t s = zio->io_size >> unit_shift; 214789Sahrens uint64_t f = b % dcols; 215789Sahrens uint64_t o = (b / dcols) << unit_shift; 2162082Seschrock uint64_t q, r, c, bc, col, acols, coff, devidx; 217789Sahrens 2182082Seschrock q = s / (dcols - nparity); 2192082Seschrock r = s - q * (dcols - nparity); 2202082Seschrock bc = (r == 0 ? 0 : r + nparity); 221789Sahrens 222789Sahrens acols = (q == 0 ? bc : dcols); 223789Sahrens 224789Sahrens rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); 225789Sahrens 226789Sahrens rm->rm_cols = acols; 227789Sahrens rm->rm_bigcols = bc; 228789Sahrens rm->rm_asize = 0; 2292082Seschrock rm->rm_missingdata = 0; 2302082Seschrock rm->rm_missingparity = 0; 2312082Seschrock rm->rm_firstdatacol = nparity; 232789Sahrens 233789Sahrens for (c = 0; c < acols; c++) { 234789Sahrens col = f + c; 235789Sahrens coff = o; 236789Sahrens if (col >= dcols) { 237789Sahrens col -= dcols; 238789Sahrens coff += 1ULL << unit_shift; 239789Sahrens } 2402082Seschrock rm->rm_col[c].rc_devidx = col; 241789Sahrens rm->rm_col[c].rc_offset = coff; 242789Sahrens rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; 243789Sahrens rm->rm_col[c].rc_data = NULL; 244789Sahrens rm->rm_col[c].rc_error = 0; 245789Sahrens rm->rm_col[c].rc_tried = 0; 246789Sahrens rm->rm_col[c].rc_skipped = 0; 247789Sahrens rm->rm_asize += rm->rm_col[c].rc_size; 248789Sahrens } 249789Sahrens 2502082Seschrock rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); 251789Sahrens 252789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++) 253789Sahrens rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 254789Sahrens 255789Sahrens rm->rm_col[c].rc_data = zio->io_data; 256789Sahrens 257789Sahrens for (c = c + 1; c < acols; c++) 258789Sahrens rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 259789Sahrens rm->rm_col[c - 1].rc_size; 260789Sahrens 2611133Seschrock /* 2622082Seschrock * If all data stored spans all columns, there's a danger that parity 2632082Seschrock * will always be on the same device and, since parity isn't read 2642082Seschrock * during normal operation, that that device's I/O bandwidth won't be 2652082Seschrock * used effectively. We therefore switch the parity every 1MB. 2662082Seschrock * 2672082Seschrock * ... at least that was, ostensibly, the theory. As a practical 2682082Seschrock * matter unless we juggle the parity between all devices evenly, we 2692082Seschrock * won't see any benefit. Further, occasional writes that aren't a 2702082Seschrock * multiple of the LCM of the number of children and the minimum 2712082Seschrock * stripe width are sufficient to avoid pessimal behavior. 2722082Seschrock * Unfortunately, this decision created an implicit on-disk format 2733456Sahl * requirement that we need to support for all eternity, but only 2743456Sahl * for single-parity RAID-Z. 2751133Seschrock */ 2761133Seschrock ASSERT(rm->rm_cols >= 2); 2771133Seschrock ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 278789Sahrens 2792082Seschrock if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 2802082Seschrock devidx = rm->rm_col[0].rc_devidx; 2811133Seschrock o = rm->rm_col[0].rc_offset; 2822082Seschrock rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 2831133Seschrock rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 2842082Seschrock rm->rm_col[1].rc_devidx = devidx; 2851133Seschrock rm->rm_col[1].rc_offset = o; 286789Sahrens } 287789Sahrens 288789Sahrens zio->io_vsd = rm; 2897754SJeff.Bonwick@Sun.COM zio->io_vsd_free = vdev_raidz_map_free; 290789Sahrens return (rm); 291789Sahrens } 292789Sahrens 293789Sahrens static void 2942082Seschrock vdev_raidz_generate_parity_p(raidz_map_t *rm) 2952082Seschrock { 2962082Seschrock uint64_t *p, *src, pcount, ccount, i; 2972082Seschrock int c; 2982082Seschrock 2992082Seschrock pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 3002082Seschrock 3012082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 3022082Seschrock src = rm->rm_col[c].rc_data; 3032082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 3042082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 3052082Seschrock 3062082Seschrock if (c == rm->rm_firstdatacol) { 3072082Seschrock ASSERT(ccount == pcount); 3082082Seschrock for (i = 0; i < ccount; i++, p++, src++) { 3092082Seschrock *p = *src; 3102082Seschrock } 3112082Seschrock } else { 3122082Seschrock ASSERT(ccount <= pcount); 3132082Seschrock for (i = 0; i < ccount; i++, p++, src++) { 3142082Seschrock *p ^= *src; 3152082Seschrock } 3162082Seschrock } 3172082Seschrock } 3182082Seschrock } 3192082Seschrock 3202082Seschrock static void 3212082Seschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm) 322789Sahrens { 3232082Seschrock uint64_t *q, *p, *src, pcount, ccount, mask, i; 3242082Seschrock int c; 3252082Seschrock 3262082Seschrock pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 3272082Seschrock ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 3282082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 3292082Seschrock 3302082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 3312082Seschrock src = rm->rm_col[c].rc_data; 3322082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 3332082Seschrock q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 3342082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 3352082Seschrock 3362082Seschrock if (c == rm->rm_firstdatacol) { 3372082Seschrock ASSERT(ccount == pcount || ccount == 0); 3382082Seschrock for (i = 0; i < ccount; i++, p++, q++, src++) { 3392082Seschrock *q = *src; 3402082Seschrock *p = *src; 3412082Seschrock } 3422082Seschrock for (; i < pcount; i++, p++, q++, src++) { 3432082Seschrock *q = 0; 3442082Seschrock *p = 0; 3452082Seschrock } 3462082Seschrock } else { 3472082Seschrock ASSERT(ccount <= pcount); 348789Sahrens 3492082Seschrock /* 3502082Seschrock * Rather than multiplying each byte individually (as 3512082Seschrock * described above), we are able to handle 8 at once 3522082Seschrock * by generating a mask based on the high bit in each 3532082Seschrock * byte and using that to conditionally XOR in 0x1d. 3542082Seschrock */ 3552082Seschrock for (i = 0; i < ccount; i++, p++, q++, src++) { 3562082Seschrock mask = *q & 0x8080808080808080ULL; 3572082Seschrock mask = (mask << 1) - (mask >> 7); 3582082Seschrock *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 3592082Seschrock (mask & 0x1d1d1d1d1d1d1d1dULL); 3602082Seschrock *q ^= *src; 3612082Seschrock *p ^= *src; 3622082Seschrock } 3632082Seschrock 3642082Seschrock /* 3652082Seschrock * Treat short columns as though they are full of 0s. 3662082Seschrock */ 3672082Seschrock for (; i < pcount; i++, q++) { 3682082Seschrock mask = *q & 0x8080808080808080ULL; 3692082Seschrock mask = (mask << 1) - (mask >> 7); 3702082Seschrock *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 3712082Seschrock (mask & 0x1d1d1d1d1d1d1d1dULL); 3722082Seschrock } 3732082Seschrock } 3742082Seschrock } 3752082Seschrock } 3762082Seschrock 3772082Seschrock static void 3782082Seschrock vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) 3792082Seschrock { 3802082Seschrock uint64_t *dst, *src, xcount, ccount, count, i; 3812082Seschrock int c; 3822082Seschrock 3832082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 3842082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 3852082Seschrock ASSERT(xcount > 0); 3862082Seschrock 3872082Seschrock src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 3882082Seschrock dst = rm->rm_col[x].rc_data; 3892082Seschrock for (i = 0; i < xcount; i++, dst++, src++) { 3902082Seschrock *dst = *src; 3912082Seschrock } 3922082Seschrock 3932082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 394789Sahrens src = rm->rm_col[c].rc_data; 395789Sahrens dst = rm->rm_col[x].rc_data; 3962082Seschrock 3972082Seschrock if (c == x) 3982082Seschrock continue; 3992082Seschrock 4002082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 4012082Seschrock count = MIN(ccount, xcount); 4022082Seschrock 4032082Seschrock for (i = 0; i < count; i++, dst++, src++) { 4042082Seschrock *dst ^= *src; 405789Sahrens } 406789Sahrens } 407789Sahrens } 408789Sahrens 4092082Seschrock static void 4102082Seschrock vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) 4112082Seschrock { 4122082Seschrock uint64_t *dst, *src, xcount, ccount, count, mask, i; 4132082Seschrock uint8_t *b; 4142082Seschrock int c, j, exp; 4152082Seschrock 4162082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 4172082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 4182082Seschrock 4192082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 4202082Seschrock src = rm->rm_col[c].rc_data; 4212082Seschrock dst = rm->rm_col[x].rc_data; 4222082Seschrock 4232082Seschrock if (c == x) 4242082Seschrock ccount = 0; 4252082Seschrock else 4262082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 4272082Seschrock 4282082Seschrock count = MIN(ccount, xcount); 4292082Seschrock 4302082Seschrock if (c == rm->rm_firstdatacol) { 4312082Seschrock for (i = 0; i < count; i++, dst++, src++) { 4322082Seschrock *dst = *src; 4332082Seschrock } 4342082Seschrock for (; i < xcount; i++, dst++) { 4352082Seschrock *dst = 0; 4362082Seschrock } 4372082Seschrock 4382082Seschrock } else { 4392082Seschrock /* 4402082Seschrock * For an explanation of this, see the comment in 4412082Seschrock * vdev_raidz_generate_parity_pq() above. 4422082Seschrock */ 4432082Seschrock for (i = 0; i < count; i++, dst++, src++) { 4442082Seschrock mask = *dst & 0x8080808080808080ULL; 4452082Seschrock mask = (mask << 1) - (mask >> 7); 4462082Seschrock *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 4472082Seschrock (mask & 0x1d1d1d1d1d1d1d1dULL); 4482082Seschrock *dst ^= *src; 4492082Seschrock } 4502082Seschrock 4512082Seschrock for (; i < xcount; i++, dst++) { 4522082Seschrock mask = *dst & 0x8080808080808080ULL; 4532082Seschrock mask = (mask << 1) - (mask >> 7); 4542082Seschrock *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 4552082Seschrock (mask & 0x1d1d1d1d1d1d1d1dULL); 4562082Seschrock } 4572082Seschrock } 4582082Seschrock } 4592082Seschrock 4602082Seschrock src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 4612082Seschrock dst = rm->rm_col[x].rc_data; 4622082Seschrock exp = 255 - (rm->rm_cols - 1 - x); 4632082Seschrock 4642082Seschrock for (i = 0; i < xcount; i++, dst++, src++) { 4652082Seschrock *dst ^= *src; 4662082Seschrock for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 4672082Seschrock *b = vdev_raidz_exp2(*b, exp); 4682082Seschrock } 4692082Seschrock } 4702082Seschrock } 4712082Seschrock 4722082Seschrock static void 4732082Seschrock vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) 4742082Seschrock { 4752082Seschrock uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 4762082Seschrock void *pdata, *qdata; 4772082Seschrock uint64_t xsize, ysize, i; 4782082Seschrock 4792082Seschrock ASSERT(x < y); 4802082Seschrock ASSERT(x >= rm->rm_firstdatacol); 4812082Seschrock ASSERT(y < rm->rm_cols); 4822082Seschrock 4832082Seschrock ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 4842082Seschrock 4852082Seschrock /* 4862082Seschrock * Move the parity data aside -- we're going to compute parity as 4872082Seschrock * though columns x and y were full of zeros -- Pxy and Qxy. We want to 4882082Seschrock * reuse the parity generation mechanism without trashing the actual 4892082Seschrock * parity so we make those columns appear to be full of zeros by 4902082Seschrock * setting their lengths to zero. 4912082Seschrock */ 4922082Seschrock pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 4932082Seschrock qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 4942082Seschrock xsize = rm->rm_col[x].rc_size; 4952082Seschrock ysize = rm->rm_col[y].rc_size; 4962082Seschrock 4972082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = 4982082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 4992082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = 5002082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 5012082Seschrock rm->rm_col[x].rc_size = 0; 5022082Seschrock rm->rm_col[y].rc_size = 0; 5032082Seschrock 5042082Seschrock vdev_raidz_generate_parity_pq(rm); 5052082Seschrock 5062082Seschrock rm->rm_col[x].rc_size = xsize; 5072082Seschrock rm->rm_col[y].rc_size = ysize; 5082082Seschrock 5092082Seschrock p = pdata; 5102082Seschrock q = qdata; 5112082Seschrock pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 5122082Seschrock qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 5132082Seschrock xd = rm->rm_col[x].rc_data; 5142082Seschrock yd = rm->rm_col[y].rc_data; 5152082Seschrock 5162082Seschrock /* 5172082Seschrock * We now have: 5182082Seschrock * Pxy = P + D_x + D_y 5192082Seschrock * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 5202082Seschrock * 5212082Seschrock * We can then solve for D_x: 5222082Seschrock * D_x = A * (P + Pxy) + B * (Q + Qxy) 5232082Seschrock * where 5242082Seschrock * A = 2^(x - y) * (2^(x - y) + 1)^-1 5252082Seschrock * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 5262082Seschrock * 5272082Seschrock * With D_x in hand, we can easily solve for D_y: 5282082Seschrock * D_y = P + Pxy + D_x 5292082Seschrock */ 5302082Seschrock 5312082Seschrock a = vdev_raidz_pow2[255 + x - y]; 5322082Seschrock b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 5332082Seschrock tmp = 255 - vdev_raidz_log2[a ^ 1]; 5342082Seschrock 5352082Seschrock aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 5362082Seschrock bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 5372082Seschrock 5382082Seschrock for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 5392082Seschrock *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 5402082Seschrock vdev_raidz_exp2(*q ^ *qxy, bexp); 5412082Seschrock 5422082Seschrock if (i < ysize) 5432082Seschrock *yd = *p ^ *pxy ^ *xd; 5442082Seschrock } 5452082Seschrock 5462082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 5472082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_size); 5482082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 5492082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 5502082Seschrock 5512082Seschrock /* 5522082Seschrock * Restore the saved parity data. 5532082Seschrock */ 5542082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 5552082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 5562082Seschrock } 5572082Seschrock 5582082Seschrock 559789Sahrens static int 560789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 561789Sahrens { 5622082Seschrock uint64_t nparity = vd->vdev_nparity; 563789Sahrens int lasterror = 0; 564789Sahrens int numerrors = 0; 565789Sahrens 5662082Seschrock ASSERT(nparity > 0); 5672082Seschrock 5682082Seschrock if (nparity > VDEV_RAIDZ_MAXPARITY || 5692082Seschrock vd->vdev_children < nparity + 1) { 570789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 571789Sahrens return (EINVAL); 572789Sahrens } 573789Sahrens 574*9846SEric.Taylor@Sun.COM vdev_open_children(vd); 575789Sahrens 576*9846SEric.Taylor@Sun.COM for (int c = 0; c < vd->vdev_children; c++) { 577*9846SEric.Taylor@Sun.COM vdev_t *cvd = vd->vdev_child[c]; 578*9846SEric.Taylor@Sun.COM 579*9846SEric.Taylor@Sun.COM if (cvd->vdev_open_error) { 580*9846SEric.Taylor@Sun.COM lasterror = cvd->vdev_open_error; 581789Sahrens numerrors++; 582789Sahrens continue; 583789Sahrens } 584789Sahrens 585789Sahrens *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 5861732Sbonwick *ashift = MAX(*ashift, cvd->vdev_ashift); 587789Sahrens } 588789Sahrens 589789Sahrens *asize *= vd->vdev_children; 590789Sahrens 5912082Seschrock if (numerrors > nparity) { 592789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 593789Sahrens return (lasterror); 594789Sahrens } 595789Sahrens 596789Sahrens return (0); 597789Sahrens } 598789Sahrens 599789Sahrens static void 600789Sahrens vdev_raidz_close(vdev_t *vd) 601789Sahrens { 602*9846SEric.Taylor@Sun.COM for (int c = 0; c < vd->vdev_children; c++) 603789Sahrens vdev_close(vd->vdev_child[c]); 604789Sahrens } 605789Sahrens 606789Sahrens static uint64_t 607789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize) 608789Sahrens { 609789Sahrens uint64_t asize; 6101732Sbonwick uint64_t ashift = vd->vdev_top->vdev_ashift; 611789Sahrens uint64_t cols = vd->vdev_children; 6122082Seschrock uint64_t nparity = vd->vdev_nparity; 613789Sahrens 6141732Sbonwick asize = ((psize - 1) >> ashift) + 1; 6152082Seschrock asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 6162082Seschrock asize = roundup(asize, nparity + 1) << ashift; 617789Sahrens 618789Sahrens return (asize); 619789Sahrens } 620789Sahrens 621789Sahrens static void 622789Sahrens vdev_raidz_child_done(zio_t *zio) 623789Sahrens { 624789Sahrens raidz_col_t *rc = zio->io_private; 625789Sahrens 626789Sahrens rc->rc_error = zio->io_error; 627789Sahrens rc->rc_tried = 1; 628789Sahrens rc->rc_skipped = 0; 629789Sahrens } 630789Sahrens 6315530Sbonwick static int 632789Sahrens vdev_raidz_io_start(zio_t *zio) 633789Sahrens { 634789Sahrens vdev_t *vd = zio->io_vd; 6351732Sbonwick vdev_t *tvd = vd->vdev_top; 636789Sahrens vdev_t *cvd; 637789Sahrens blkptr_t *bp = zio->io_bp; 638789Sahrens raidz_map_t *rm; 639789Sahrens raidz_col_t *rc; 640789Sahrens int c; 641789Sahrens 6422082Seschrock rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 6432082Seschrock vd->vdev_nparity); 644789Sahrens 6451775Sbillm ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 646789Sahrens 647789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 648789Sahrens /* 6492082Seschrock * Generate RAID parity in the first virtual columns. 650789Sahrens */ 6512082Seschrock if (rm->rm_firstdatacol == 1) 6522082Seschrock vdev_raidz_generate_parity_p(rm); 6532082Seschrock else 6542082Seschrock vdev_raidz_generate_parity_pq(rm); 655789Sahrens 656789Sahrens for (c = 0; c < rm->rm_cols; c++) { 657789Sahrens rc = &rm->rm_col[c]; 6582082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 659789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 660789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 6617754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0, 662789Sahrens vdev_raidz_child_done, rc)); 663789Sahrens } 6645530Sbonwick 6657754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 666789Sahrens } 667789Sahrens 668789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 669789Sahrens 6702082Seschrock /* 6712082Seschrock * Iterate over the columns in reverse order so that we hit the parity 6722082Seschrock * last -- any errors along the way will force us to read the parity 6732082Seschrock * data. 6742082Seschrock */ 675789Sahrens for (c = rm->rm_cols - 1; c >= 0; c--) { 676789Sahrens rc = &rm->rm_col[c]; 6772082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 6785329Sgw25295 if (!vdev_readable(cvd)) { 6792082Seschrock if (c >= rm->rm_firstdatacol) 6802082Seschrock rm->rm_missingdata++; 6812082Seschrock else 6822082Seschrock rm->rm_missingparity++; 683789Sahrens rc->rc_error = ENXIO; 684789Sahrens rc->rc_tried = 1; /* don't even try */ 685789Sahrens rc->rc_skipped = 1; 686789Sahrens continue; 687789Sahrens } 6888241SJeff.Bonwick@Sun.COM if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) { 6892082Seschrock if (c >= rm->rm_firstdatacol) 6902082Seschrock rm->rm_missingdata++; 6912082Seschrock else 6922082Seschrock rm->rm_missingparity++; 693789Sahrens rc->rc_error = ESTALE; 694789Sahrens rc->rc_skipped = 1; 695789Sahrens continue; 696789Sahrens } 6972082Seschrock if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 6989434SMark.Musante@Sun.COM (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 699789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 700789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 7017754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0, 702789Sahrens vdev_raidz_child_done, rc)); 703789Sahrens } 704789Sahrens } 705789Sahrens 7067754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 707789Sahrens } 708789Sahrens 7091544Seschrock /* 7101544Seschrock * Report a checksum error for a child of a RAID-Z device. 7111544Seschrock */ 7121544Seschrock static void 7131544Seschrock raidz_checksum_error(zio_t *zio, raidz_col_t *rc) 7141544Seschrock { 7152082Seschrock vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 7161544Seschrock 7171544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 7181544Seschrock mutex_enter(&vd->vdev_stat_lock); 7191544Seschrock vd->vdev_stat.vs_checksum_errors++; 7201544Seschrock mutex_exit(&vd->vdev_stat_lock); 7211544Seschrock } 7221544Seschrock 7231544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 7241544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 7251544Seschrock zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); 7261544Seschrock } 7271544Seschrock 7282082Seschrock /* 7292082Seschrock * Generate the parity from the data columns. If we tried and were able to 7302082Seschrock * read the parity without error, verify that the generated parity matches the 7312082Seschrock * data we read. If it doesn't, we fire off a checksum error. Return the 7322082Seschrock * number such failures. 7332082Seschrock */ 7342082Seschrock static int 7352082Seschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 7362082Seschrock { 7372082Seschrock void *orig[VDEV_RAIDZ_MAXPARITY]; 7382082Seschrock int c, ret = 0; 7392082Seschrock raidz_col_t *rc; 7402082Seschrock 7412082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 7422082Seschrock rc = &rm->rm_col[c]; 7432082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 7442082Seschrock continue; 7452082Seschrock orig[c] = zio_buf_alloc(rc->rc_size); 7462082Seschrock bcopy(rc->rc_data, orig[c], rc->rc_size); 7472082Seschrock } 7482082Seschrock 7492082Seschrock if (rm->rm_firstdatacol == 1) 7502082Seschrock vdev_raidz_generate_parity_p(rm); 7512082Seschrock else 7522082Seschrock vdev_raidz_generate_parity_pq(rm); 7532082Seschrock 7542082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 7552082Seschrock rc = &rm->rm_col[c]; 7562082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 7572082Seschrock continue; 7582082Seschrock if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 7592082Seschrock raidz_checksum_error(zio, rc); 7602082Seschrock rc->rc_error = ECKSUM; 7612082Seschrock ret++; 7622082Seschrock } 7632082Seschrock zio_buf_free(orig[c], rc->rc_size); 7642082Seschrock } 7652082Seschrock 7662082Seschrock return (ret); 7672082Seschrock } 7682082Seschrock 7692082Seschrock static uint64_t raidz_corrected_p; 7702082Seschrock static uint64_t raidz_corrected_q; 7712082Seschrock static uint64_t raidz_corrected_pq; 7721544Seschrock 7735530Sbonwick static int 7747754SJeff.Bonwick@Sun.COM vdev_raidz_worst_error(raidz_map_t *rm) 7757754SJeff.Bonwick@Sun.COM { 7767754SJeff.Bonwick@Sun.COM int error = 0; 7777754SJeff.Bonwick@Sun.COM 7787754SJeff.Bonwick@Sun.COM for (int c = 0; c < rm->rm_cols; c++) 7797754SJeff.Bonwick@Sun.COM error = zio_worst_error(error, rm->rm_col[c].rc_error); 7807754SJeff.Bonwick@Sun.COM 7817754SJeff.Bonwick@Sun.COM return (error); 7827754SJeff.Bonwick@Sun.COM } 7837754SJeff.Bonwick@Sun.COM 7847754SJeff.Bonwick@Sun.COM static void 785789Sahrens vdev_raidz_io_done(zio_t *zio) 786789Sahrens { 787789Sahrens vdev_t *vd = zio->io_vd; 788789Sahrens vdev_t *cvd; 789789Sahrens raidz_map_t *rm = zio->io_vsd; 7902082Seschrock raidz_col_t *rc, *rc1; 791789Sahrens int unexpected_errors = 0; 7922082Seschrock int parity_errors = 0; 7933456Sahl int parity_untried = 0; 7942082Seschrock int data_errors = 0; 7957754SJeff.Bonwick@Sun.COM int total_errors = 0; 7962082Seschrock int n, c, c1; 797789Sahrens 7981775Sbillm ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 799789Sahrens 8002082Seschrock ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 8012082Seschrock ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 8022082Seschrock 803789Sahrens for (c = 0; c < rm->rm_cols; c++) { 804789Sahrens rc = &rm->rm_col[c]; 805789Sahrens 806789Sahrens if (rc->rc_error) { 8077754SJeff.Bonwick@Sun.COM ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 8082082Seschrock 8092082Seschrock if (c < rm->rm_firstdatacol) 8102082Seschrock parity_errors++; 8112082Seschrock else 8122082Seschrock data_errors++; 8132082Seschrock 814789Sahrens if (!rc->rc_skipped) 815789Sahrens unexpected_errors++; 8162082Seschrock 8177754SJeff.Bonwick@Sun.COM total_errors++; 8183456Sahl } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 8193456Sahl parity_untried++; 820789Sahrens } 821789Sahrens } 822789Sahrens 823789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 824789Sahrens /* 8257754SJeff.Bonwick@Sun.COM * XXX -- for now, treat partial writes as a success. 8267754SJeff.Bonwick@Sun.COM * (If we couldn't write enough columns to reconstruct 8277754SJeff.Bonwick@Sun.COM * the data, the I/O failed. Otherwise, good enough.) 8287754SJeff.Bonwick@Sun.COM * 8297754SJeff.Bonwick@Sun.COM * Now that we support write reallocation, it would be better 8307754SJeff.Bonwick@Sun.COM * to treat partial failure as real failure unless there are 8317754SJeff.Bonwick@Sun.COM * no non-degraded top-level vdevs left, and not update DTLs 8327754SJeff.Bonwick@Sun.COM * if we intend to reallocate. 833789Sahrens */ 834789Sahrens /* XXPOLICY */ 8357754SJeff.Bonwick@Sun.COM if (total_errors > rm->rm_firstdatacol) 8367754SJeff.Bonwick@Sun.COM zio->io_error = vdev_raidz_worst_error(rm); 837789Sahrens 8387754SJeff.Bonwick@Sun.COM return; 839789Sahrens } 840789Sahrens 841789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 8422082Seschrock /* 8432082Seschrock * There are three potential phases for a read: 8442082Seschrock * 1. produce valid data from the columns read 8452082Seschrock * 2. read all disks and try again 8462082Seschrock * 3. perform combinatorial reconstruction 8472082Seschrock * 8482082Seschrock * Each phase is progressively both more expensive and less likely to 8492082Seschrock * occur. If we encounter more errors than we can repair or all phases 8502082Seschrock * fail, we have no choice but to return an error. 8512082Seschrock */ 852789Sahrens 853789Sahrens /* 8542082Seschrock * If the number of errors we saw was correctable -- less than or equal 8553456Sahl * to the number of parity disks read -- attempt to produce data that 8563456Sahl * has a valid checksum. Naturally, this case applies in the absence of 8573456Sahl * any errors. 858789Sahrens */ 8597754SJeff.Bonwick@Sun.COM if (total_errors <= rm->rm_firstdatacol - parity_untried) { 8602082Seschrock switch (data_errors) { 8612082Seschrock case 0: 8622082Seschrock if (zio_checksum_error(zio) == 0) { 8634034Sahl /* 8644034Sahl * If we read parity information (unnecessarily 8654034Sahl * as it happens since no reconstruction was 8664034Sahl * needed) regenerate and verify the parity. 8674034Sahl * We also regenerate parity when resilvering 8684034Sahl * so we can write it out to the failed device 8694034Sahl * later. 8704034Sahl */ 8713456Sahl if (parity_errors + parity_untried < 8724034Sahl rm->rm_firstdatacol || 8734034Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) { 8743456Sahl n = raidz_parity_verify(zio, rm); 8753456Sahl unexpected_errors += n; 8763456Sahl ASSERT(parity_errors + n <= 8773456Sahl rm->rm_firstdatacol); 8783456Sahl } 8792082Seschrock goto done; 8802082Seschrock } 8812082Seschrock break; 8822082Seschrock 8832082Seschrock case 1: 8843456Sahl /* 8853456Sahl * We either attempt to read all the parity columns or 8863456Sahl * none of them. If we didn't try to read parity, we 8873456Sahl * wouldn't be here in the correctable case. There must 8883456Sahl * also have been fewer parity errors than parity 8893456Sahl * columns or, again, we wouldn't be in this code path. 8903456Sahl */ 8913456Sahl ASSERT(parity_untried == 0); 8922082Seschrock ASSERT(parity_errors < rm->rm_firstdatacol); 8932082Seschrock 8942082Seschrock /* 8952082Seschrock * Find the column that reported the error. 8962082Seschrock */ 8972082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 8982082Seschrock rc = &rm->rm_col[c]; 8992082Seschrock if (rc->rc_error != 0) 9002082Seschrock break; 9012082Seschrock } 9022082Seschrock ASSERT(c != rm->rm_cols); 9032082Seschrock ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 9042082Seschrock rc->rc_error == ESTALE); 9052082Seschrock 9062082Seschrock if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { 9072082Seschrock vdev_raidz_reconstruct_p(rm, c); 9082082Seschrock } else { 9092082Seschrock ASSERT(rm->rm_firstdatacol > 1); 9102082Seschrock vdev_raidz_reconstruct_q(rm, c); 9112082Seschrock } 9122082Seschrock 9132082Seschrock if (zio_checksum_error(zio) == 0) { 9142082Seschrock if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) 9152082Seschrock atomic_inc_64(&raidz_corrected_p); 9162082Seschrock else 9172082Seschrock atomic_inc_64(&raidz_corrected_q); 918789Sahrens 9192082Seschrock /* 9203456Sahl * If there's more than one parity disk that 9213456Sahl * was successfully read, confirm that the 9223456Sahl * other parity disk produced the correct data. 9233456Sahl * This routine is suboptimal in that it 9243456Sahl * regenerates both the parity we wish to test 9253456Sahl * as well as the parity we just used to 9263456Sahl * perform the reconstruction, but this should 9273456Sahl * be a relatively uncommon case, and can be 9283456Sahl * optimized if it becomes a problem. 9294034Sahl * We also regenerate parity when resilvering 9304034Sahl * so we can write it out to the failed device 9314034Sahl * later. 9322082Seschrock */ 9334034Sahl if (parity_errors < rm->rm_firstdatacol - 1 || 9344034Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) { 9352082Seschrock n = raidz_parity_verify(zio, rm); 9362082Seschrock unexpected_errors += n; 9372082Seschrock ASSERT(parity_errors + n <= 9382082Seschrock rm->rm_firstdatacol); 9392082Seschrock } 9402082Seschrock 9412082Seschrock goto done; 9422082Seschrock } 9432082Seschrock break; 9442082Seschrock 9452082Seschrock case 2: 9462082Seschrock /* 9473456Sahl * Two data column errors require double parity. 9483456Sahl */ 9493456Sahl ASSERT(rm->rm_firstdatacol == 2); 9503456Sahl 9513456Sahl /* 9522082Seschrock * Find the two columns that reported errors. 9532082Seschrock */ 9542082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 9552082Seschrock rc = &rm->rm_col[c]; 9562082Seschrock if (rc->rc_error != 0) 9572082Seschrock break; 958789Sahrens } 9592082Seschrock ASSERT(c != rm->rm_cols); 9602082Seschrock ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 9612082Seschrock rc->rc_error == ESTALE); 9622082Seschrock 9632082Seschrock for (c1 = c++; c < rm->rm_cols; c++) { 9642082Seschrock rc = &rm->rm_col[c]; 9652082Seschrock if (rc->rc_error != 0) 9662082Seschrock break; 9672082Seschrock } 9682082Seschrock ASSERT(c != rm->rm_cols); 9692082Seschrock ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 9702082Seschrock rc->rc_error == ESTALE); 971789Sahrens 9722082Seschrock vdev_raidz_reconstruct_pq(rm, c1, c); 9732082Seschrock 9742082Seschrock if (zio_checksum_error(zio) == 0) { 9752082Seschrock atomic_inc_64(&raidz_corrected_pq); 9762082Seschrock goto done; 9772082Seschrock } 9782082Seschrock break; 9792082Seschrock 9802082Seschrock default: 9812082Seschrock ASSERT(rm->rm_firstdatacol <= 2); 9822082Seschrock ASSERT(0); 983789Sahrens } 984789Sahrens } 985789Sahrens 986789Sahrens /* 9872082Seschrock * This isn't a typical situation -- either we got a read error or 9882082Seschrock * a child silently returned bad data. Read every block so we can 9892082Seschrock * try again with as much data and parity as we can track down. If 9902082Seschrock * we've already been through once before, all children will be marked 9912082Seschrock * as tried so we'll proceed to combinatorial reconstruction. 992789Sahrens */ 993789Sahrens unexpected_errors = 1; 9942082Seschrock rm->rm_missingdata = 0; 9952082Seschrock rm->rm_missingparity = 0; 996789Sahrens 9972082Seschrock for (c = 0; c < rm->rm_cols; c++) { 9982082Seschrock if (rm->rm_col[c].rc_tried) 9992082Seschrock continue; 1000789Sahrens 1001789Sahrens zio_vdev_io_redone(zio); 10022082Seschrock do { 1003789Sahrens rc = &rm->rm_col[c]; 1004789Sahrens if (rc->rc_tried) 1005789Sahrens continue; 1006789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, 10072082Seschrock vd->vdev_child[rc->rc_devidx], 1008789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 10097754SJeff.Bonwick@Sun.COM zio->io_type, zio->io_priority, 0, 1010789Sahrens vdev_raidz_child_done, rc)); 10112082Seschrock } while (++c < rm->rm_cols); 10125530Sbonwick 10137754SJeff.Bonwick@Sun.COM return; 1014789Sahrens } 1015789Sahrens 1016789Sahrens /* 10172082Seschrock * At this point we've attempted to reconstruct the data given the 10182082Seschrock * errors we detected, and we've attempted to read all columns. There 10192082Seschrock * must, therefore, be one or more additional problems -- silent errors 10202082Seschrock * resulting in invalid data rather than explicit I/O errors resulting 10212082Seschrock * in absent data. Before we attempt combinatorial reconstruction make 10222082Seschrock * sure we have a chance of coming up with the right answer. 1023789Sahrens */ 10247754SJeff.Bonwick@Sun.COM if (total_errors >= rm->rm_firstdatacol) { 10257754SJeff.Bonwick@Sun.COM zio->io_error = vdev_raidz_worst_error(rm); 10267754SJeff.Bonwick@Sun.COM /* 10277754SJeff.Bonwick@Sun.COM * If there were exactly as many device errors as parity 10287754SJeff.Bonwick@Sun.COM * columns, yet we couldn't reconstruct the data, then at 10297754SJeff.Bonwick@Sun.COM * least one device must have returned bad data silently. 10307754SJeff.Bonwick@Sun.COM */ 10317754SJeff.Bonwick@Sun.COM if (total_errors == rm->rm_firstdatacol) 10327754SJeff.Bonwick@Sun.COM zio->io_error = zio_worst_error(zio->io_error, ECKSUM); 1033789Sahrens goto done; 1034789Sahrens } 1035789Sahrens 10362082Seschrock if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { 10372082Seschrock /* 10382082Seschrock * Attempt to reconstruct the data from parity P. 10392082Seschrock */ 10402082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 10412082Seschrock void *orig; 10422082Seschrock rc = &rm->rm_col[c]; 10432082Seschrock 10442082Seschrock orig = zio_buf_alloc(rc->rc_size); 10452082Seschrock bcopy(rc->rc_data, orig, rc->rc_size); 10462082Seschrock vdev_raidz_reconstruct_p(rm, c); 10472082Seschrock 10482082Seschrock if (zio_checksum_error(zio) == 0) { 10492082Seschrock zio_buf_free(orig, rc->rc_size); 10502082Seschrock atomic_inc_64(&raidz_corrected_p); 10512082Seschrock 10522082Seschrock /* 10532082Seschrock * If this child didn't know that it returned 10542082Seschrock * bad data, inform it. 10552082Seschrock */ 10562082Seschrock if (rc->rc_tried && rc->rc_error == 0) 10572082Seschrock raidz_checksum_error(zio, rc); 10582082Seschrock rc->rc_error = ECKSUM; 10592082Seschrock goto done; 10602082Seschrock } 10612082Seschrock 10622082Seschrock bcopy(orig, rc->rc_data, rc->rc_size); 10632082Seschrock zio_buf_free(orig, rc->rc_size); 10642082Seschrock } 10652082Seschrock } 10662082Seschrock 10672082Seschrock if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { 10682082Seschrock /* 10692082Seschrock * Attempt to reconstruct the data from parity Q. 10702082Seschrock */ 10712082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 10722082Seschrock void *orig; 10732082Seschrock rc = &rm->rm_col[c]; 10742082Seschrock 10752082Seschrock orig = zio_buf_alloc(rc->rc_size); 10762082Seschrock bcopy(rc->rc_data, orig, rc->rc_size); 10772082Seschrock vdev_raidz_reconstruct_q(rm, c); 10782082Seschrock 10792082Seschrock if (zio_checksum_error(zio) == 0) { 10802082Seschrock zio_buf_free(orig, rc->rc_size); 10812082Seschrock atomic_inc_64(&raidz_corrected_q); 10822082Seschrock 10832082Seschrock /* 10842082Seschrock * If this child didn't know that it returned 10852082Seschrock * bad data, inform it. 10862082Seschrock */ 10872082Seschrock if (rc->rc_tried && rc->rc_error == 0) 10882082Seschrock raidz_checksum_error(zio, rc); 10892082Seschrock rc->rc_error = ECKSUM; 10902082Seschrock goto done; 10912082Seschrock } 10922082Seschrock 10932082Seschrock bcopy(orig, rc->rc_data, rc->rc_size); 10942082Seschrock zio_buf_free(orig, rc->rc_size); 10952082Seschrock } 10962082Seschrock } 10972082Seschrock 10982082Seschrock if (rm->rm_firstdatacol > 1 && 10992082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && 11002082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { 11012082Seschrock /* 11022082Seschrock * Attempt to reconstruct the data from both P and Q. 11032082Seschrock */ 11042082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { 11052082Seschrock void *orig, *orig1; 11062082Seschrock rc = &rm->rm_col[c]; 11072082Seschrock 11082082Seschrock orig = zio_buf_alloc(rc->rc_size); 11092082Seschrock bcopy(rc->rc_data, orig, rc->rc_size); 11102082Seschrock 11112082Seschrock for (c1 = c + 1; c1 < rm->rm_cols; c1++) { 11122082Seschrock rc1 = &rm->rm_col[c1]; 11132082Seschrock 11142082Seschrock orig1 = zio_buf_alloc(rc1->rc_size); 11152082Seschrock bcopy(rc1->rc_data, orig1, rc1->rc_size); 11162082Seschrock 11172082Seschrock vdev_raidz_reconstruct_pq(rm, c, c1); 11182082Seschrock 11192082Seschrock if (zio_checksum_error(zio) == 0) { 11202082Seschrock zio_buf_free(orig, rc->rc_size); 11212082Seschrock zio_buf_free(orig1, rc1->rc_size); 11222082Seschrock atomic_inc_64(&raidz_corrected_pq); 11232082Seschrock 11242082Seschrock /* 11252082Seschrock * If these children didn't know they 11262082Seschrock * returned bad data, inform them. 11272082Seschrock */ 11282082Seschrock if (rc->rc_tried && rc->rc_error == 0) 11292082Seschrock raidz_checksum_error(zio, rc); 11302082Seschrock if (rc1->rc_tried && rc1->rc_error == 0) 11312082Seschrock raidz_checksum_error(zio, rc1); 11322082Seschrock 11332082Seschrock rc->rc_error = ECKSUM; 11342082Seschrock rc1->rc_error = ECKSUM; 11352082Seschrock 11362082Seschrock goto done; 11372082Seschrock } 11382082Seschrock 11392082Seschrock bcopy(orig1, rc1->rc_data, rc1->rc_size); 11402082Seschrock zio_buf_free(orig1, rc1->rc_size); 11412082Seschrock } 11422082Seschrock 11432082Seschrock bcopy(orig, rc->rc_data, rc->rc_size); 11442082Seschrock zio_buf_free(orig, rc->rc_size); 1145789Sahrens } 1146789Sahrens } 1147789Sahrens 1148789Sahrens /* 11492082Seschrock * All combinations failed to checksum. Generate checksum ereports for 11502082Seschrock * all children. 1151789Sahrens */ 1152789Sahrens zio->io_error = ECKSUM; 11537754SJeff.Bonwick@Sun.COM 11541544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 11551544Seschrock for (c = 0; c < rm->rm_cols; c++) { 11561544Seschrock rc = &rm->rm_col[c]; 11571544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 11582082Seschrock zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, 11591544Seschrock rc->rc_offset, rc->rc_size); 11601544Seschrock } 11611544Seschrock } 1162789Sahrens 1163789Sahrens done: 1164789Sahrens zio_checksum_verified(zio); 1165789Sahrens 11668241SJeff.Bonwick@Sun.COM if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 1167789Sahrens (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 1168789Sahrens /* 1169789Sahrens * Use the good data we have in hand to repair damaged children. 1170789Sahrens */ 1171789Sahrens for (c = 0; c < rm->rm_cols; c++) { 1172789Sahrens rc = &rm->rm_col[c]; 11732082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 1174789Sahrens 11751732Sbonwick if (rc->rc_error == 0) 11761732Sbonwick continue; 11771732Sbonwick 11787754SJeff.Bonwick@Sun.COM zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 11791732Sbonwick rc->rc_offset, rc->rc_data, rc->rc_size, 11801732Sbonwick ZIO_TYPE_WRITE, zio->io_priority, 11818241SJeff.Bonwick@Sun.COM ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 11828241SJeff.Bonwick@Sun.COM ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 11831732Sbonwick } 1184789Sahrens } 1185789Sahrens } 1186789Sahrens 1187789Sahrens static void 1188789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 1189789Sahrens { 11902082Seschrock if (faulted > vd->vdev_nparity) 11911544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 11921544Seschrock VDEV_AUX_NO_REPLICAS); 1193789Sahrens else if (degraded + faulted != 0) 11941544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 1195789Sahrens else 11961544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 1197789Sahrens } 1198789Sahrens 1199789Sahrens vdev_ops_t vdev_raidz_ops = { 1200789Sahrens vdev_raidz_open, 1201789Sahrens vdev_raidz_close, 1202789Sahrens vdev_raidz_asize, 1203789Sahrens vdev_raidz_io_start, 1204789Sahrens vdev_raidz_io_done, 1205789Sahrens vdev_raidz_state_change, 1206789Sahrens VDEV_TYPE_RAIDZ, /* name of this vdev type */ 1207789Sahrens B_FALSE /* not a leaf vdev */ 1208789Sahrens }; 1209