1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 212082Seschrock 22789Sahrens /* 233456Sahl * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24789Sahrens * Use is subject to license terms. 25789Sahrens */ 26789Sahrens 27789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28789Sahrens 29789Sahrens #include <sys/zfs_context.h> 30789Sahrens #include <sys/spa.h> 31789Sahrens #include <sys/vdev_impl.h> 32789Sahrens #include <sys/zio.h> 33789Sahrens #include <sys/zio_checksum.h> 34789Sahrens #include <sys/fs/zfs.h> 351544Seschrock #include <sys/fm/fs/zfs.h> 36789Sahrens 37789Sahrens /* 38789Sahrens * Virtual device vector for RAID-Z. 392082Seschrock * 402082Seschrock * This vdev supports both single and double parity. For single parity, we 412082Seschrock * use a simple XOR of all the data columns. For double parity, we use both 422082Seschrock * the simple XOR as well as a technique described in "The mathematics of 432082Seschrock * RAID-6" by H. Peter Anvin. 
 * This technique defines a Galois field, GF(2^8),
 * over the integers expressible in a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is A^254.
 *
 * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
 * can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^(n-1) * D_0 + 2^(n-2) * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *
 * See the reconstruction code below for how P and Q can be used individually
 * or in concert to recover missing data columns.
76789Sahrens */ 77789Sahrens 78789Sahrens typedef struct raidz_col { 792082Seschrock uint64_t rc_devidx; /* child device index for I/O */ 802082Seschrock uint64_t rc_offset; /* device offset */ 812082Seschrock uint64_t rc_size; /* I/O size */ 822082Seschrock void *rc_data; /* I/O data */ 832082Seschrock int rc_error; /* I/O error for this device */ 842082Seschrock uint8_t rc_tried; /* Did we attempt this I/O column? */ 852082Seschrock uint8_t rc_skipped; /* Did we skip this I/O column? */ 86789Sahrens } raidz_col_t; 87789Sahrens 88789Sahrens typedef struct raidz_map { 892082Seschrock uint64_t rm_cols; /* Column count */ 902082Seschrock uint64_t rm_bigcols; /* Number of oversized columns */ 912082Seschrock uint64_t rm_asize; /* Actual total I/O size */ 922082Seschrock uint64_t rm_missingdata; /* Count of missing data devices */ 932082Seschrock uint64_t rm_missingparity; /* Count of missing parity devices */ 942082Seschrock uint64_t rm_firstdatacol; /* First data column/parity count */ 952082Seschrock raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 96789Sahrens } raidz_map_t; 97789Sahrens 982082Seschrock #define VDEV_RAIDZ_P 0 992082Seschrock #define VDEV_RAIDZ_Q 1 1002082Seschrock 1012082Seschrock #define VDEV_RAIDZ_MAXPARITY 2 1022082Seschrock 1032082Seschrock #define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) 1042082Seschrock 1052082Seschrock /* 1062082Seschrock * These two tables represent powers and logs of 2 in the Galois field defined 1072082Seschrock * above. These values were computed by repeatedly multiplying by 2 as above. 
1082082Seschrock */ 1092082Seschrock static const uint8_t vdev_raidz_pow2[256] = { 1102082Seschrock 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 1112082Seschrock 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 1122082Seschrock 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 1132082Seschrock 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 1142082Seschrock 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 1152082Seschrock 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 1162082Seschrock 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 1172082Seschrock 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 1182082Seschrock 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 1192082Seschrock 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 1202082Seschrock 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 1212082Seschrock 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 1222082Seschrock 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 1232082Seschrock 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 1242082Seschrock 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 1252082Seschrock 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 1262082Seschrock 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 1272082Seschrock 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 1282082Seschrock 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 1292082Seschrock 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 1302082Seschrock 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 1312082Seschrock 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 1322082Seschrock 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 1332082Seschrock 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 1342082Seschrock 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 1352082Seschrock 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 1362082Seschrock 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 1372082Seschrock 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 1382082Seschrock 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 1392082Seschrock 0xf7, 0xf3, 
0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 1402082Seschrock 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 1412082Seschrock 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 1422082Seschrock }; 1432082Seschrock static const uint8_t vdev_raidz_log2[256] = { 1442082Seschrock 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 1452082Seschrock 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 1462082Seschrock 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 1472082Seschrock 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 1482082Seschrock 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 1492082Seschrock 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 1502082Seschrock 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 1512082Seschrock 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 1522082Seschrock 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 1532082Seschrock 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 1542082Seschrock 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 1552082Seschrock 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 1562082Seschrock 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 1572082Seschrock 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 1582082Seschrock 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 1592082Seschrock 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 1602082Seschrock 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 1612082Seschrock 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 1622082Seschrock 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 1632082Seschrock 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 1642082Seschrock 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 1652082Seschrock 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 1662082Seschrock 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 1672082Seschrock 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 1682082Seschrock 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 1692082Seschrock 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 1702082Seschrock 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 
0xaa, 1712082Seschrock 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 1722082Seschrock 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 1732082Seschrock 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 1742082Seschrock 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 1752082Seschrock 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 1762082Seschrock }; 1772082Seschrock 1782082Seschrock /* 1792082Seschrock * Multiply a given number by 2 raised to the given power. 1802082Seschrock */ 1812082Seschrock static uint8_t 1822082Seschrock vdev_raidz_exp2(uint_t a, int exp) 1832082Seschrock { 1842082Seschrock if (a == 0) 1852082Seschrock return (0); 1862082Seschrock 1872082Seschrock ASSERT(exp >= 0); 1882082Seschrock ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 1892082Seschrock 1902082Seschrock exp += vdev_raidz_log2[a]; 1912082Seschrock if (exp > 255) 1922082Seschrock exp -= 255; 1932082Seschrock 1942082Seschrock return (vdev_raidz_pow2[exp]); 1952082Seschrock } 1962082Seschrock 197789Sahrens static raidz_map_t * 1982082Seschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 1992082Seschrock uint64_t nparity) 200789Sahrens { 201789Sahrens raidz_map_t *rm; 202789Sahrens uint64_t b = zio->io_offset >> unit_shift; 203789Sahrens uint64_t s = zio->io_size >> unit_shift; 204789Sahrens uint64_t f = b % dcols; 205789Sahrens uint64_t o = (b / dcols) << unit_shift; 2062082Seschrock uint64_t q, r, c, bc, col, acols, coff, devidx; 207789Sahrens 2082082Seschrock q = s / (dcols - nparity); 2092082Seschrock r = s - q * (dcols - nparity); 2102082Seschrock bc = (r == 0 ? 0 : r + nparity); 211789Sahrens 212789Sahrens acols = (q == 0 ? 
bc : dcols); 213789Sahrens 214789Sahrens rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); 215789Sahrens 216789Sahrens rm->rm_cols = acols; 217789Sahrens rm->rm_bigcols = bc; 218789Sahrens rm->rm_asize = 0; 2192082Seschrock rm->rm_missingdata = 0; 2202082Seschrock rm->rm_missingparity = 0; 2212082Seschrock rm->rm_firstdatacol = nparity; 222789Sahrens 223789Sahrens for (c = 0; c < acols; c++) { 224789Sahrens col = f + c; 225789Sahrens coff = o; 226789Sahrens if (col >= dcols) { 227789Sahrens col -= dcols; 228789Sahrens coff += 1ULL << unit_shift; 229789Sahrens } 2302082Seschrock rm->rm_col[c].rc_devidx = col; 231789Sahrens rm->rm_col[c].rc_offset = coff; 232789Sahrens rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; 233789Sahrens rm->rm_col[c].rc_data = NULL; 234789Sahrens rm->rm_col[c].rc_error = 0; 235789Sahrens rm->rm_col[c].rc_tried = 0; 236789Sahrens rm->rm_col[c].rc_skipped = 0; 237789Sahrens rm->rm_asize += rm->rm_col[c].rc_size; 238789Sahrens } 239789Sahrens 2402082Seschrock rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); 241789Sahrens 242789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++) 243789Sahrens rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 244789Sahrens 245789Sahrens rm->rm_col[c].rc_data = zio->io_data; 246789Sahrens 247789Sahrens for (c = c + 1; c < acols; c++) 248789Sahrens rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 249789Sahrens rm->rm_col[c - 1].rc_size; 250789Sahrens 2511133Seschrock /* 2522082Seschrock * If all data stored spans all columns, there's a danger that parity 2532082Seschrock * will always be on the same device and, since parity isn't read 2542082Seschrock * during normal operation, that that device's I/O bandwidth won't be 2552082Seschrock * used effectively. We therefore switch the parity every 1MB. 2562082Seschrock * 2572082Seschrock * ... at least that was, ostensibly, the theory. 
As a practical 2582082Seschrock * matter unless we juggle the parity between all devices evenly, we 2592082Seschrock * won't see any benefit. Further, occasional writes that aren't a 2602082Seschrock * multiple of the LCM of the number of children and the minimum 2612082Seschrock * stripe width are sufficient to avoid pessimal behavior. 2622082Seschrock * Unfortunately, this decision created an implicit on-disk format 2633456Sahl * requirement that we need to support for all eternity, but only 2643456Sahl * for single-parity RAID-Z. 2651133Seschrock */ 2661133Seschrock ASSERT(rm->rm_cols >= 2); 2671133Seschrock ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 268789Sahrens 2692082Seschrock if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 2702082Seschrock devidx = rm->rm_col[0].rc_devidx; 2711133Seschrock o = rm->rm_col[0].rc_offset; 2722082Seschrock rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 2731133Seschrock rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 2742082Seschrock rm->rm_col[1].rc_devidx = devidx; 2751133Seschrock rm->rm_col[1].rc_offset = o; 276789Sahrens } 277789Sahrens 278789Sahrens zio->io_vsd = rm; 279789Sahrens return (rm); 280789Sahrens } 281789Sahrens 282789Sahrens static void 283789Sahrens vdev_raidz_map_free(zio_t *zio) 284789Sahrens { 285789Sahrens raidz_map_t *rm = zio->io_vsd; 286789Sahrens int c; 287789Sahrens 288789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++) 289789Sahrens zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 290789Sahrens 291789Sahrens kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); 292789Sahrens zio->io_vsd = NULL; 293789Sahrens } 294789Sahrens 295789Sahrens static void 2962082Seschrock vdev_raidz_generate_parity_p(raidz_map_t *rm) 2972082Seschrock { 2982082Seschrock uint64_t *p, *src, pcount, ccount, i; 2992082Seschrock int c; 3002082Seschrock 3012082Seschrock pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 3022082Seschrock 3032082Seschrock for (c = 
rm->rm_firstdatacol; c < rm->rm_cols; c++) { 3042082Seschrock src = rm->rm_col[c].rc_data; 3052082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 3062082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 3072082Seschrock 3082082Seschrock if (c == rm->rm_firstdatacol) { 3092082Seschrock ASSERT(ccount == pcount); 3102082Seschrock for (i = 0; i < ccount; i++, p++, src++) { 3112082Seschrock *p = *src; 3122082Seschrock } 3132082Seschrock } else { 3142082Seschrock ASSERT(ccount <= pcount); 3152082Seschrock for (i = 0; i < ccount; i++, p++, src++) { 3162082Seschrock *p ^= *src; 3172082Seschrock } 3182082Seschrock } 3192082Seschrock } 3202082Seschrock } 3212082Seschrock 3222082Seschrock static void 3232082Seschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm) 324789Sahrens { 3252082Seschrock uint64_t *q, *p, *src, pcount, ccount, mask, i; 3262082Seschrock int c; 3272082Seschrock 3282082Seschrock pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 3292082Seschrock ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 3302082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 3312082Seschrock 3322082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 3332082Seschrock src = rm->rm_col[c].rc_data; 3342082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 3352082Seschrock q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 3362082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 3372082Seschrock 3382082Seschrock if (c == rm->rm_firstdatacol) { 3392082Seschrock ASSERT(ccount == pcount || ccount == 0); 3402082Seschrock for (i = 0; i < ccount; i++, p++, q++, src++) { 3412082Seschrock *q = *src; 3422082Seschrock *p = *src; 3432082Seschrock } 3442082Seschrock for (; i < pcount; i++, p++, q++, src++) { 3452082Seschrock *q = 0; 3462082Seschrock *p = 0; 3472082Seschrock } 3482082Seschrock } else { 3492082Seschrock ASSERT(ccount <= pcount); 350789Sahrens 3512082Seschrock /* 3522082Seschrock * Rather than multiplying each byte individually (as 3532082Seschrock * described above), 
we are able to handle 8 at once 3542082Seschrock * by generating a mask based on the high bit in each 3552082Seschrock * byte and using that to conditionally XOR in 0x1d. 3562082Seschrock */ 3572082Seschrock for (i = 0; i < ccount; i++, p++, q++, src++) { 3582082Seschrock mask = *q & 0x8080808080808080ULL; 3592082Seschrock mask = (mask << 1) - (mask >> 7); 3602082Seschrock *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 3612082Seschrock (mask & 0x1d1d1d1d1d1d1d1dULL); 3622082Seschrock *q ^= *src; 3632082Seschrock *p ^= *src; 3642082Seschrock } 3652082Seschrock 3662082Seschrock /* 3672082Seschrock * Treat short columns as though they are full of 0s. 3682082Seschrock */ 3692082Seschrock for (; i < pcount; i++, q++) { 3702082Seschrock mask = *q & 0x8080808080808080ULL; 3712082Seschrock mask = (mask << 1) - (mask >> 7); 3722082Seschrock *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 3732082Seschrock (mask & 0x1d1d1d1d1d1d1d1dULL); 3742082Seschrock } 3752082Seschrock } 3762082Seschrock } 3772082Seschrock } 3782082Seschrock 3792082Seschrock static void 3802082Seschrock vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) 3812082Seschrock { 3822082Seschrock uint64_t *dst, *src, xcount, ccount, count, i; 3832082Seschrock int c; 3842082Seschrock 3852082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 3862082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 3872082Seschrock ASSERT(xcount > 0); 3882082Seschrock 3892082Seschrock src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 3902082Seschrock dst = rm->rm_col[x].rc_data; 3912082Seschrock for (i = 0; i < xcount; i++, dst++, src++) { 3922082Seschrock *dst = *src; 3932082Seschrock } 3942082Seschrock 3952082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 396789Sahrens src = rm->rm_col[c].rc_data; 397789Sahrens dst = rm->rm_col[x].rc_data; 3982082Seschrock 3992082Seschrock if (c == x) 4002082Seschrock continue; 4012082Seschrock 4022082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 
4032082Seschrock count = MIN(ccount, xcount); 4042082Seschrock 4052082Seschrock for (i = 0; i < count; i++, dst++, src++) { 4062082Seschrock *dst ^= *src; 407789Sahrens } 408789Sahrens } 409789Sahrens } 410789Sahrens 4112082Seschrock static void 4122082Seschrock vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) 4132082Seschrock { 4142082Seschrock uint64_t *dst, *src, xcount, ccount, count, mask, i; 4152082Seschrock uint8_t *b; 4162082Seschrock int c, j, exp; 4172082Seschrock 4182082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 4192082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 4202082Seschrock 4212082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 4222082Seschrock src = rm->rm_col[c].rc_data; 4232082Seschrock dst = rm->rm_col[x].rc_data; 4242082Seschrock 4252082Seschrock if (c == x) 4262082Seschrock ccount = 0; 4272082Seschrock else 4282082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 4292082Seschrock 4302082Seschrock count = MIN(ccount, xcount); 4312082Seschrock 4322082Seschrock if (c == rm->rm_firstdatacol) { 4332082Seschrock for (i = 0; i < count; i++, dst++, src++) { 4342082Seschrock *dst = *src; 4352082Seschrock } 4362082Seschrock for (; i < xcount; i++, dst++) { 4372082Seschrock *dst = 0; 4382082Seschrock } 4392082Seschrock 4402082Seschrock } else { 4412082Seschrock /* 4422082Seschrock * For an explanation of this, see the comment in 4432082Seschrock * vdev_raidz_generate_parity_pq() above. 
4442082Seschrock */ 4452082Seschrock for (i = 0; i < count; i++, dst++, src++) { 4462082Seschrock mask = *dst & 0x8080808080808080ULL; 4472082Seschrock mask = (mask << 1) - (mask >> 7); 4482082Seschrock *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 4492082Seschrock (mask & 0x1d1d1d1d1d1d1d1dULL); 4502082Seschrock *dst ^= *src; 4512082Seschrock } 4522082Seschrock 4532082Seschrock for (; i < xcount; i++, dst++) { 4542082Seschrock mask = *dst & 0x8080808080808080ULL; 4552082Seschrock mask = (mask << 1) - (mask >> 7); 4562082Seschrock *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 4572082Seschrock (mask & 0x1d1d1d1d1d1d1d1dULL); 4582082Seschrock } 4592082Seschrock } 4602082Seschrock } 4612082Seschrock 4622082Seschrock src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 4632082Seschrock dst = rm->rm_col[x].rc_data; 4642082Seschrock exp = 255 - (rm->rm_cols - 1 - x); 4652082Seschrock 4662082Seschrock for (i = 0; i < xcount; i++, dst++, src++) { 4672082Seschrock *dst ^= *src; 4682082Seschrock for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 4692082Seschrock *b = vdev_raidz_exp2(*b, exp); 4702082Seschrock } 4712082Seschrock } 4722082Seschrock } 4732082Seschrock 4742082Seschrock static void 4752082Seschrock vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) 4762082Seschrock { 4772082Seschrock uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 4782082Seschrock void *pdata, *qdata; 4792082Seschrock uint64_t xsize, ysize, i; 4802082Seschrock 4812082Seschrock ASSERT(x < y); 4822082Seschrock ASSERT(x >= rm->rm_firstdatacol); 4832082Seschrock ASSERT(y < rm->rm_cols); 4842082Seschrock 4852082Seschrock ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 4862082Seschrock 4872082Seschrock /* 4882082Seschrock * Move the parity data aside -- we're going to compute parity as 4892082Seschrock * though columns x and y were full of zeros -- Pxy and Qxy. 
We want to 4902082Seschrock * reuse the parity generation mechanism without trashing the actual 4912082Seschrock * parity so we make those columns appear to be full of zeros by 4922082Seschrock * setting their lengths to zero. 4932082Seschrock */ 4942082Seschrock pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 4952082Seschrock qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 4962082Seschrock xsize = rm->rm_col[x].rc_size; 4972082Seschrock ysize = rm->rm_col[y].rc_size; 4982082Seschrock 4992082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = 5002082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 5012082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = 5022082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 5032082Seschrock rm->rm_col[x].rc_size = 0; 5042082Seschrock rm->rm_col[y].rc_size = 0; 5052082Seschrock 5062082Seschrock vdev_raidz_generate_parity_pq(rm); 5072082Seschrock 5082082Seschrock rm->rm_col[x].rc_size = xsize; 5092082Seschrock rm->rm_col[y].rc_size = ysize; 5102082Seschrock 5112082Seschrock p = pdata; 5122082Seschrock q = qdata; 5132082Seschrock pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 5142082Seschrock qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 5152082Seschrock xd = rm->rm_col[x].rc_data; 5162082Seschrock yd = rm->rm_col[y].rc_data; 5172082Seschrock 5182082Seschrock /* 5192082Seschrock * We now have: 5202082Seschrock * Pxy = P + D_x + D_y 5212082Seschrock * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 5222082Seschrock * 5232082Seschrock * We can then solve for D_x: 5242082Seschrock * D_x = A * (P + Pxy) + B * (Q + Qxy) 5252082Seschrock * where 5262082Seschrock * A = 2^(x - y) * (2^(x - y) + 1)^-1 5272082Seschrock * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 5282082Seschrock * 5292082Seschrock * With D_x in hand, we can easily solve for D_y: 5302082Seschrock * D_y = P + Pxy + D_x 5312082Seschrock */ 5322082Seschrock 5332082Seschrock a = vdev_raidz_pow2[255 + x - y]; 5342082Seschrock b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 
5352082Seschrock tmp = 255 - vdev_raidz_log2[a ^ 1]; 5362082Seschrock 5372082Seschrock aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 5382082Seschrock bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 5392082Seschrock 5402082Seschrock for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 5412082Seschrock *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 5422082Seschrock vdev_raidz_exp2(*q ^ *qxy, bexp); 5432082Seschrock 5442082Seschrock if (i < ysize) 5452082Seschrock *yd = *p ^ *pxy ^ *xd; 5462082Seschrock } 5472082Seschrock 5482082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 5492082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_size); 5502082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 5512082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 5522082Seschrock 5532082Seschrock /* 5542082Seschrock * Restore the saved parity data. 5552082Seschrock */ 5562082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 5572082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 5582082Seschrock } 5592082Seschrock 5602082Seschrock 561789Sahrens static int 562789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 563789Sahrens { 564789Sahrens vdev_t *cvd; 5652082Seschrock uint64_t nparity = vd->vdev_nparity; 566789Sahrens int c, error; 567789Sahrens int lasterror = 0; 568789Sahrens int numerrors = 0; 569789Sahrens 5702082Seschrock ASSERT(nparity > 0); 5712082Seschrock 5722082Seschrock if (nparity > VDEV_RAIDZ_MAXPARITY || 5732082Seschrock vd->vdev_children < nparity + 1) { 574789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 575789Sahrens return (EINVAL); 576789Sahrens } 577789Sahrens 578789Sahrens for (c = 0; c < vd->vdev_children; c++) { 579789Sahrens cvd = vd->vdev_child[c]; 580789Sahrens 581789Sahrens if ((error = vdev_open(cvd)) != 0) { 582789Sahrens lasterror = error; 583789Sahrens numerrors++; 584789Sahrens continue; 585789Sahrens } 586789Sahrens 587789Sahrens *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 5881732Sbonwick *ashift = 
MAX(*ashift, cvd->vdev_ashift); 589789Sahrens } 590789Sahrens 591789Sahrens *asize *= vd->vdev_children; 592789Sahrens 5932082Seschrock if (numerrors > nparity) { 594789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 595789Sahrens return (lasterror); 596789Sahrens } 597789Sahrens 598789Sahrens return (0); 599789Sahrens } 600789Sahrens 601789Sahrens static void 602789Sahrens vdev_raidz_close(vdev_t *vd) 603789Sahrens { 604789Sahrens int c; 605789Sahrens 606789Sahrens for (c = 0; c < vd->vdev_children; c++) 607789Sahrens vdev_close(vd->vdev_child[c]); 608789Sahrens } 609789Sahrens 610789Sahrens static uint64_t 611789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize) 612789Sahrens { 613789Sahrens uint64_t asize; 6141732Sbonwick uint64_t ashift = vd->vdev_top->vdev_ashift; 615789Sahrens uint64_t cols = vd->vdev_children; 6162082Seschrock uint64_t nparity = vd->vdev_nparity; 617789Sahrens 6181732Sbonwick asize = ((psize - 1) >> ashift) + 1; 6192082Seschrock asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 6202082Seschrock asize = roundup(asize, nparity + 1) << ashift; 621789Sahrens 622789Sahrens return (asize); 623789Sahrens } 624789Sahrens 625789Sahrens static void 626789Sahrens vdev_raidz_child_done(zio_t *zio) 627789Sahrens { 628789Sahrens raidz_col_t *rc = zio->io_private; 629789Sahrens 630789Sahrens rc->rc_error = zio->io_error; 631789Sahrens rc->rc_tried = 1; 632789Sahrens rc->rc_skipped = 0; 633789Sahrens } 634789Sahrens 635789Sahrens static void 636789Sahrens vdev_raidz_repair_done(zio_t *zio) 637789Sahrens { 6381732Sbonwick ASSERT(zio->io_private == zio->io_parent); 6391732Sbonwick vdev_raidz_map_free(zio->io_private); 640789Sahrens } 641789Sahrens 642789Sahrens static void 643789Sahrens vdev_raidz_io_start(zio_t *zio) 644789Sahrens { 645789Sahrens vdev_t *vd = zio->io_vd; 6461732Sbonwick vdev_t *tvd = vd->vdev_top; 647789Sahrens vdev_t *cvd; 648789Sahrens blkptr_t *bp = zio->io_bp; 649789Sahrens raidz_map_t *rm; 650789Sahrens 
raidz_col_t *rc; 651789Sahrens int c; 652789Sahrens 6532082Seschrock rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 6542082Seschrock vd->vdev_nparity); 655789Sahrens 6561775Sbillm ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 657789Sahrens 658789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 659789Sahrens /* 6602082Seschrock * Generate RAID parity in the first virtual columns. 661789Sahrens */ 6622082Seschrock if (rm->rm_firstdatacol == 1) 6632082Seschrock vdev_raidz_generate_parity_p(rm); 6642082Seschrock else 6652082Seschrock vdev_raidz_generate_parity_pq(rm); 666789Sahrens 667789Sahrens for (c = 0; c < rm->rm_cols; c++) { 668789Sahrens rc = &rm->rm_col[c]; 6692082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 670789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 671789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 672789Sahrens zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 673789Sahrens vdev_raidz_child_done, rc)); 674789Sahrens } 675789Sahrens zio_wait_children_done(zio); 676789Sahrens return; 677789Sahrens } 678789Sahrens 679789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 680789Sahrens 6812082Seschrock /* 6822082Seschrock * Iterate over the columns in reverse order so that we hit the parity 6832082Seschrock * last -- any errors along the way will force us to read the parity 6842082Seschrock * data. 
6852082Seschrock */ 686789Sahrens for (c = rm->rm_cols - 1; c >= 0; c--) { 687789Sahrens rc = &rm->rm_col[c]; 6882082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 689789Sahrens if (vdev_is_dead(cvd)) { 6902082Seschrock if (c >= rm->rm_firstdatacol) 6912082Seschrock rm->rm_missingdata++; 6922082Seschrock else 6932082Seschrock rm->rm_missingparity++; 694789Sahrens rc->rc_error = ENXIO; 695789Sahrens rc->rc_tried = 1; /* don't even try */ 696789Sahrens rc->rc_skipped = 1; 697789Sahrens continue; 698789Sahrens } 699789Sahrens if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { 7002082Seschrock if (c >= rm->rm_firstdatacol) 7012082Seschrock rm->rm_missingdata++; 7022082Seschrock else 7032082Seschrock rm->rm_missingparity++; 704789Sahrens rc->rc_error = ESTALE; 705789Sahrens rc->rc_skipped = 1; 706789Sahrens continue; 707789Sahrens } 7082082Seschrock if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 709789Sahrens (zio->io_flags & ZIO_FLAG_SCRUB)) { 710789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 711789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 712789Sahrens zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 713789Sahrens vdev_raidz_child_done, rc)); 714789Sahrens } 715789Sahrens } 716789Sahrens 717789Sahrens zio_wait_children_done(zio); 718789Sahrens } 719789Sahrens 7201544Seschrock /* 7211544Seschrock * Report a checksum error for a child of a RAID-Z device. 
7221544Seschrock */ 7231544Seschrock static void 7241544Seschrock raidz_checksum_error(zio_t *zio, raidz_col_t *rc) 7251544Seschrock { 7262082Seschrock vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 7271544Seschrock dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", 7281544Seschrock vdev_description(vd)); 7291544Seschrock 7301544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 7311544Seschrock mutex_enter(&vd->vdev_stat_lock); 7321544Seschrock vd->vdev_stat.vs_checksum_errors++; 7331544Seschrock mutex_exit(&vd->vdev_stat_lock); 7341544Seschrock } 7351544Seschrock 7361544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 7371544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 7381544Seschrock zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); 7391544Seschrock } 7401544Seschrock 7412082Seschrock /* 7422082Seschrock * Generate the parity from the data columns. If we tried and were able to 7432082Seschrock * read the parity without error, verify that the generated parity matches the 7442082Seschrock * data we read. If it doesn't, we fire off a checksum error. Return the 7452082Seschrock * number such failures. 
7462082Seschrock */ 7472082Seschrock static int 7482082Seschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 7492082Seschrock { 7502082Seschrock void *orig[VDEV_RAIDZ_MAXPARITY]; 7512082Seschrock int c, ret = 0; 7522082Seschrock raidz_col_t *rc; 7532082Seschrock 7542082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 7552082Seschrock rc = &rm->rm_col[c]; 7562082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 7572082Seschrock continue; 7582082Seschrock orig[c] = zio_buf_alloc(rc->rc_size); 7592082Seschrock bcopy(rc->rc_data, orig[c], rc->rc_size); 7602082Seschrock } 7612082Seschrock 7622082Seschrock if (rm->rm_firstdatacol == 1) 7632082Seschrock vdev_raidz_generate_parity_p(rm); 7642082Seschrock else 7652082Seschrock vdev_raidz_generate_parity_pq(rm); 7662082Seschrock 7672082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 7682082Seschrock rc = &rm->rm_col[c]; 7692082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 7702082Seschrock continue; 7712082Seschrock if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 7722082Seschrock raidz_checksum_error(zio, rc); 7732082Seschrock rc->rc_error = ECKSUM; 7742082Seschrock ret++; 7752082Seschrock } 7762082Seschrock zio_buf_free(orig[c], rc->rc_size); 7772082Seschrock } 7782082Seschrock 7792082Seschrock return (ret); 7802082Seschrock } 7812082Seschrock 7822082Seschrock static uint64_t raidz_corrected_p; 7832082Seschrock static uint64_t raidz_corrected_q; 7842082Seschrock static uint64_t raidz_corrected_pq; 7851544Seschrock 786789Sahrens static void 787789Sahrens vdev_raidz_io_done(zio_t *zio) 788789Sahrens { 789789Sahrens vdev_t *vd = zio->io_vd; 790789Sahrens vdev_t *cvd; 791789Sahrens raidz_map_t *rm = zio->io_vsd; 7922082Seschrock raidz_col_t *rc, *rc1; 793789Sahrens int unexpected_errors = 0; 7942082Seschrock int parity_errors = 0; 7953456Sahl int parity_untried = 0; 7962082Seschrock int data_errors = 0; 7972082Seschrock int n, c, c1; 798789Sahrens 7991775Sbillm ASSERT(zio->io_bp != NULL); /* XXX need to 
add code to enforce this */ 800789Sahrens 801789Sahrens zio->io_error = 0; 802789Sahrens zio->io_numerrors = 0; 803789Sahrens 8042082Seschrock ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 8052082Seschrock ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 8062082Seschrock 807789Sahrens for (c = 0; c < rm->rm_cols; c++) { 808789Sahrens rc = &rm->rm_col[c]; 809789Sahrens 810789Sahrens /* 811789Sahrens * We preserve any EIOs because those may be worth retrying; 812789Sahrens * whereas ECKSUM and ENXIO are more likely to be persistent. 813789Sahrens */ 814789Sahrens if (rc->rc_error) { 815789Sahrens if (zio->io_error != EIO) 816789Sahrens zio->io_error = rc->rc_error; 8172082Seschrock 8182082Seschrock if (c < rm->rm_firstdatacol) 8192082Seschrock parity_errors++; 8202082Seschrock else 8212082Seschrock data_errors++; 8222082Seschrock 823789Sahrens if (!rc->rc_skipped) 824789Sahrens unexpected_errors++; 8252082Seschrock 826789Sahrens zio->io_numerrors++; 8273456Sahl } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 8283456Sahl parity_untried++; 829789Sahrens } 830789Sahrens } 831789Sahrens 832789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 833789Sahrens /* 834789Sahrens * If this is not a failfast write, and we were able to 835789Sahrens * write enough columns to reconstruct the data, good enough. 836789Sahrens */ 837789Sahrens /* XXPOLICY */ 838789Sahrens if (zio->io_numerrors <= rm->rm_firstdatacol && 839789Sahrens !(zio->io_flags & ZIO_FLAG_FAILFAST)) 840789Sahrens zio->io_error = 0; 841789Sahrens 842789Sahrens vdev_raidz_map_free(zio); 843789Sahrens zio_next_stage(zio); 844789Sahrens return; 845789Sahrens } 846789Sahrens 847789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 8482082Seschrock /* 8492082Seschrock * There are three potential phases for a read: 8502082Seschrock * 1. produce valid data from the columns read 8512082Seschrock * 2. read all disks and try again 8522082Seschrock * 3. 
perform combinatorial reconstruction 8532082Seschrock * 8542082Seschrock * Each phase is progressively both more expensive and less likely to 8552082Seschrock * occur. If we encounter more errors than we can repair or all phases 8562082Seschrock * fail, we have no choice but to return an error. 8572082Seschrock */ 858789Sahrens 859789Sahrens /* 8602082Seschrock * If the number of errors we saw was correctable -- less than or equal 8613456Sahl * to the number of parity disks read -- attempt to produce data that 8623456Sahl * has a valid checksum. Naturally, this case applies in the absence of 8633456Sahl * any errors. 864789Sahrens */ 8653456Sahl if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) { 8662082Seschrock switch (data_errors) { 8672082Seschrock case 0: 8682082Seschrock if (zio_checksum_error(zio) == 0) { 8692082Seschrock zio->io_error = 0; 870*4034Sahl 871*4034Sahl /* 872*4034Sahl * If we read parity information (unnecessarily 873*4034Sahl * as it happens since no reconstruction was 874*4034Sahl * needed) regenerate and verify the parity. 875*4034Sahl * We also regenerate parity when resilvering 876*4034Sahl * so we can write it out to the failed device 877*4034Sahl * later. 878*4034Sahl */ 8793456Sahl if (parity_errors + parity_untried < 880*4034Sahl rm->rm_firstdatacol || 881*4034Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) { 8823456Sahl n = raidz_parity_verify(zio, rm); 8833456Sahl unexpected_errors += n; 8843456Sahl ASSERT(parity_errors + n <= 8853456Sahl rm->rm_firstdatacol); 8863456Sahl } 8872082Seschrock goto done; 8882082Seschrock } 8892082Seschrock break; 8902082Seschrock 8912082Seschrock case 1: 8923456Sahl /* 8933456Sahl * We either attempt to read all the parity columns or 8943456Sahl * none of them. If we didn't try to read parity, we 8953456Sahl * wouldn't be here in the correctable case. There must 8963456Sahl * also have been fewer parity errors than parity 8973456Sahl * columns or, again, we wouldn't be in this code path. 
8983456Sahl */ 8993456Sahl ASSERT(parity_untried == 0); 9002082Seschrock ASSERT(parity_errors < rm->rm_firstdatacol); 9012082Seschrock 9022082Seschrock /* 9032082Seschrock * Find the column that reported the error. 9042082Seschrock */ 9052082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 9062082Seschrock rc = &rm->rm_col[c]; 9072082Seschrock if (rc->rc_error != 0) 9082082Seschrock break; 9092082Seschrock } 9102082Seschrock ASSERT(c != rm->rm_cols); 9112082Seschrock ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 9122082Seschrock rc->rc_error == ESTALE); 9132082Seschrock 9142082Seschrock if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { 9152082Seschrock vdev_raidz_reconstruct_p(rm, c); 9162082Seschrock } else { 9172082Seschrock ASSERT(rm->rm_firstdatacol > 1); 9182082Seschrock vdev_raidz_reconstruct_q(rm, c); 9192082Seschrock } 9202082Seschrock 9212082Seschrock if (zio_checksum_error(zio) == 0) { 9222082Seschrock zio->io_error = 0; 9232082Seschrock if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) 9242082Seschrock atomic_inc_64(&raidz_corrected_p); 9252082Seschrock else 9262082Seschrock atomic_inc_64(&raidz_corrected_q); 927789Sahrens 9282082Seschrock /* 9293456Sahl * If there's more than one parity disk that 9303456Sahl * was successfully read, confirm that the 9313456Sahl * other parity disk produced the correct data. 9323456Sahl * This routine is suboptimal in that it 9333456Sahl * regenerates both the parity we wish to test 9343456Sahl * as well as the parity we just used to 9353456Sahl * perform the reconstruction, but this should 9363456Sahl * be a relatively uncommon case, and can be 9373456Sahl * optimized if it becomes a problem. 938*4034Sahl * We also regenerate parity when resilvering 939*4034Sahl * so we can write it out to the failed device 940*4034Sahl * later. 
9412082Seschrock */ 942*4034Sahl if (parity_errors < rm->rm_firstdatacol - 1 || 943*4034Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) { 9442082Seschrock n = raidz_parity_verify(zio, rm); 9452082Seschrock unexpected_errors += n; 9462082Seschrock ASSERT(parity_errors + n <= 9472082Seschrock rm->rm_firstdatacol); 9482082Seschrock } 9492082Seschrock 9502082Seschrock goto done; 9512082Seschrock } 9522082Seschrock break; 9532082Seschrock 9542082Seschrock case 2: 9552082Seschrock /* 9563456Sahl * Two data column errors require double parity. 9573456Sahl */ 9583456Sahl ASSERT(rm->rm_firstdatacol == 2); 9593456Sahl 9603456Sahl /* 9612082Seschrock * Find the two columns that reported errors. 9622082Seschrock */ 9632082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 9642082Seschrock rc = &rm->rm_col[c]; 9652082Seschrock if (rc->rc_error != 0) 9662082Seschrock break; 967789Sahrens } 9682082Seschrock ASSERT(c != rm->rm_cols); 9692082Seschrock ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 9702082Seschrock rc->rc_error == ESTALE); 9712082Seschrock 9722082Seschrock for (c1 = c++; c < rm->rm_cols; c++) { 9732082Seschrock rc = &rm->rm_col[c]; 9742082Seschrock if (rc->rc_error != 0) 9752082Seschrock break; 9762082Seschrock } 9772082Seschrock ASSERT(c != rm->rm_cols); 9782082Seschrock ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 9792082Seschrock rc->rc_error == ESTALE); 980789Sahrens 9812082Seschrock vdev_raidz_reconstruct_pq(rm, c1, c); 9822082Seschrock 9832082Seschrock if (zio_checksum_error(zio) == 0) { 9842082Seschrock zio->io_error = 0; 9852082Seschrock atomic_inc_64(&raidz_corrected_pq); 9862082Seschrock 9872082Seschrock goto done; 9882082Seschrock } 9892082Seschrock break; 9902082Seschrock 9912082Seschrock default: 9922082Seschrock ASSERT(rm->rm_firstdatacol <= 2); 9932082Seschrock ASSERT(0); 994789Sahrens } 995789Sahrens } 996789Sahrens 997789Sahrens /* 9982082Seschrock * This isn't a typical situation -- either we got a read error or 9992082Seschrock * 
a child silently returned bad data. Read every block so we can 10002082Seschrock * try again with as much data and parity as we can track down. If 10012082Seschrock * we've already been through once before, all children will be marked 10022082Seschrock * as tried so we'll proceed to combinatorial reconstruction. 1003789Sahrens */ 1004789Sahrens unexpected_errors = 1; 10052082Seschrock rm->rm_missingdata = 0; 10062082Seschrock rm->rm_missingparity = 0; 1007789Sahrens 10082082Seschrock for (c = 0; c < rm->rm_cols; c++) { 10092082Seschrock if (rm->rm_col[c].rc_tried) 10102082Seschrock continue; 1011789Sahrens 1012789Sahrens zio->io_error = 0; 1013789Sahrens zio_vdev_io_redone(zio); 10142082Seschrock do { 1015789Sahrens rc = &rm->rm_col[c]; 1016789Sahrens if (rc->rc_tried) 1017789Sahrens continue; 1018789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, 10192082Seschrock vd->vdev_child[rc->rc_devidx], 1020789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 1021789Sahrens zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 1022789Sahrens vdev_raidz_child_done, rc)); 10232082Seschrock } while (++c < rm->rm_cols); 10242082Seschrock dprintf("rereading\n"); 1025789Sahrens zio_wait_children_done(zio); 1026789Sahrens return; 1027789Sahrens } 1028789Sahrens 1029789Sahrens /* 10302082Seschrock * At this point we've attempted to reconstruct the data given the 10312082Seschrock * errors we detected, and we've attempted to read all columns. There 10322082Seschrock * must, therefore, be one or more additional problems -- silent errors 10332082Seschrock * resulting in invalid data rather than explicit I/O errors resulting 10342082Seschrock * in absent data. Before we attempt combinatorial reconstruction make 10352082Seschrock * sure we have a chance of coming up with the right answer. 
1036789Sahrens */ 10372082Seschrock if (zio->io_numerrors >= rm->rm_firstdatacol) { 1038789Sahrens ASSERT(zio->io_error != 0); 1039789Sahrens goto done; 1040789Sahrens } 1041789Sahrens 10422082Seschrock if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { 10432082Seschrock /* 10442082Seschrock * Attempt to reconstruct the data from parity P. 10452082Seschrock */ 10462082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 10472082Seschrock void *orig; 10482082Seschrock rc = &rm->rm_col[c]; 10492082Seschrock 10502082Seschrock orig = zio_buf_alloc(rc->rc_size); 10512082Seschrock bcopy(rc->rc_data, orig, rc->rc_size); 10522082Seschrock vdev_raidz_reconstruct_p(rm, c); 10532082Seschrock 10542082Seschrock if (zio_checksum_error(zio) == 0) { 10552082Seschrock zio_buf_free(orig, rc->rc_size); 10562082Seschrock zio->io_error = 0; 10572082Seschrock atomic_inc_64(&raidz_corrected_p); 10582082Seschrock 10592082Seschrock /* 10602082Seschrock * If this child didn't know that it returned 10612082Seschrock * bad data, inform it. 10622082Seschrock */ 10632082Seschrock if (rc->rc_tried && rc->rc_error == 0) 10642082Seschrock raidz_checksum_error(zio, rc); 10652082Seschrock rc->rc_error = ECKSUM; 10662082Seschrock goto done; 10672082Seschrock } 10682082Seschrock 10692082Seschrock bcopy(orig, rc->rc_data, rc->rc_size); 10702082Seschrock zio_buf_free(orig, rc->rc_size); 10712082Seschrock } 10722082Seschrock } 10732082Seschrock 10742082Seschrock if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { 10752082Seschrock /* 10762082Seschrock * Attempt to reconstruct the data from parity Q. 
10772082Seschrock */ 10782082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 10792082Seschrock void *orig; 10802082Seschrock rc = &rm->rm_col[c]; 10812082Seschrock 10822082Seschrock orig = zio_buf_alloc(rc->rc_size); 10832082Seschrock bcopy(rc->rc_data, orig, rc->rc_size); 10842082Seschrock vdev_raidz_reconstruct_q(rm, c); 10852082Seschrock 10862082Seschrock if (zio_checksum_error(zio) == 0) { 10872082Seschrock zio_buf_free(orig, rc->rc_size); 1088789Sahrens zio->io_error = 0; 10892082Seschrock atomic_inc_64(&raidz_corrected_q); 10902082Seschrock 10912082Seschrock /* 10922082Seschrock * If this child didn't know that it returned 10932082Seschrock * bad data, inform it. 10942082Seschrock */ 10952082Seschrock if (rc->rc_tried && rc->rc_error == 0) 10962082Seschrock raidz_checksum_error(zio, rc); 10972082Seschrock rc->rc_error = ECKSUM; 10982082Seschrock goto done; 10992082Seschrock } 11002082Seschrock 11012082Seschrock bcopy(orig, rc->rc_data, rc->rc_size); 11022082Seschrock zio_buf_free(orig, rc->rc_size); 11032082Seschrock } 11042082Seschrock } 11052082Seschrock 11062082Seschrock if (rm->rm_firstdatacol > 1 && 11072082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && 11082082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { 11092082Seschrock /* 11102082Seschrock * Attempt to reconstruct the data from both P and Q. 
11112082Seschrock */ 11122082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { 11132082Seschrock void *orig, *orig1; 11142082Seschrock rc = &rm->rm_col[c]; 11152082Seschrock 11162082Seschrock orig = zio_buf_alloc(rc->rc_size); 11172082Seschrock bcopy(rc->rc_data, orig, rc->rc_size); 11182082Seschrock 11192082Seschrock for (c1 = c + 1; c1 < rm->rm_cols; c1++) { 11202082Seschrock rc1 = &rm->rm_col[c1]; 11212082Seschrock 11222082Seschrock orig1 = zio_buf_alloc(rc1->rc_size); 11232082Seschrock bcopy(rc1->rc_data, orig1, rc1->rc_size); 11242082Seschrock 11252082Seschrock vdev_raidz_reconstruct_pq(rm, c, c1); 11262082Seschrock 11272082Seschrock if (zio_checksum_error(zio) == 0) { 11282082Seschrock zio_buf_free(orig, rc->rc_size); 11292082Seschrock zio_buf_free(orig1, rc1->rc_size); 11302082Seschrock zio->io_error = 0; 11312082Seschrock atomic_inc_64(&raidz_corrected_pq); 11322082Seschrock 11332082Seschrock /* 11342082Seschrock * If these children didn't know they 11352082Seschrock * returned bad data, inform them. 11362082Seschrock */ 11372082Seschrock if (rc->rc_tried && rc->rc_error == 0) 11382082Seschrock raidz_checksum_error(zio, rc); 11392082Seschrock if (rc1->rc_tried && rc1->rc_error == 0) 11402082Seschrock raidz_checksum_error(zio, rc1); 11412082Seschrock 11422082Seschrock rc->rc_error = ECKSUM; 11432082Seschrock rc1->rc_error = ECKSUM; 11442082Seschrock 11452082Seschrock goto done; 11462082Seschrock } 11472082Seschrock 11482082Seschrock bcopy(orig1, rc1->rc_data, rc1->rc_size); 11492082Seschrock zio_buf_free(orig1, rc1->rc_size); 11502082Seschrock } 11512082Seschrock 11522082Seschrock bcopy(orig, rc->rc_data, rc->rc_size); 11532082Seschrock zio_buf_free(orig, rc->rc_size); 1154789Sahrens } 1155789Sahrens } 1156789Sahrens 1157789Sahrens /* 11582082Seschrock * All combinations failed to checksum. Generate checksum ereports for 11592082Seschrock * all children. 
1160789Sahrens */ 1161789Sahrens zio->io_error = ECKSUM; 11621544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 11631544Seschrock for (c = 0; c < rm->rm_cols; c++) { 11641544Seschrock rc = &rm->rm_col[c]; 11651544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 11662082Seschrock zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, 11671544Seschrock rc->rc_offset, rc->rc_size); 11681544Seschrock } 11691544Seschrock } 1170789Sahrens 1171789Sahrens done: 1172789Sahrens zio_checksum_verified(zio); 1173789Sahrens 1174789Sahrens if (zio->io_error == 0 && (spa_mode & FWRITE) && 1175789Sahrens (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 11761732Sbonwick zio_t *rio; 11771732Sbonwick 1178789Sahrens /* 1179789Sahrens * Use the good data we have in hand to repair damaged children. 11801732Sbonwick * 11811732Sbonwick * We issue all repair I/Os as children of 'rio' to arrange 11821732Sbonwick * that vdev_raidz_map_free(zio) will be invoked after all 11831732Sbonwick * repairs complete, but before we advance to the next stage. 
1184789Sahrens */ 11851732Sbonwick rio = zio_null(zio, zio->io_spa, 11861732Sbonwick vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL); 11871732Sbonwick 1188789Sahrens for (c = 0; c < rm->rm_cols; c++) { 1189789Sahrens rc = &rm->rm_col[c]; 11902082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 1191789Sahrens 11921732Sbonwick if (rc->rc_error == 0) 11931732Sbonwick continue; 11941732Sbonwick 11951732Sbonwick dprintf("%s resilvered %s @ 0x%llx error %d\n", 11961732Sbonwick vdev_description(vd), 11971732Sbonwick vdev_description(cvd), 11981732Sbonwick zio->io_offset, rc->rc_error); 1199789Sahrens 12001732Sbonwick zio_nowait(zio_vdev_child_io(rio, NULL, cvd, 12011732Sbonwick rc->rc_offset, rc->rc_data, rc->rc_size, 12021732Sbonwick ZIO_TYPE_WRITE, zio->io_priority, 12032082Seschrock ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE | 12042082Seschrock ZIO_FLAG_CANFAIL, NULL, NULL)); 12051732Sbonwick } 1206789Sahrens 12071732Sbonwick zio_nowait(rio); 12081732Sbonwick zio_wait_children_done(zio); 12091732Sbonwick return; 1210789Sahrens } 1211789Sahrens 1212789Sahrens vdev_raidz_map_free(zio); 1213789Sahrens zio_next_stage(zio); 1214789Sahrens } 1215789Sahrens 1216789Sahrens static void 1217789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 1218789Sahrens { 12192082Seschrock if (faulted > vd->vdev_nparity) 12201544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 12211544Seschrock VDEV_AUX_NO_REPLICAS); 1222789Sahrens else if (degraded + faulted != 0) 12231544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 1224789Sahrens else 12251544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 1226789Sahrens } 1227789Sahrens 1228789Sahrens vdev_ops_t vdev_raidz_ops = { 1229789Sahrens vdev_raidz_open, 1230789Sahrens vdev_raidz_close, 1231789Sahrens vdev_raidz_asize, 1232789Sahrens vdev_raidz_io_start, 1233789Sahrens vdev_raidz_io_done, 1234789Sahrens vdev_raidz_state_change, 1235789Sahrens VDEV_TYPE_RAIDZ, /* 
name of this vdev type */ 1236789Sahrens B_FALSE /* not a leaf vdev */ 1237789Sahrens }; 1238