1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 212082Seschrock 22789Sahrens /* 23*3456Sahl * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24789Sahrens * Use is subject to license terms. 25789Sahrens */ 26789Sahrens 27789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28789Sahrens 29789Sahrens #include <sys/zfs_context.h> 30789Sahrens #include <sys/spa.h> 31789Sahrens #include <sys/vdev_impl.h> 32789Sahrens #include <sys/zio.h> 33789Sahrens #include <sys/zio_checksum.h> 34789Sahrens #include <sys/fs/zfs.h> 351544Seschrock #include <sys/fm/fs/zfs.h> 36789Sahrens 37789Sahrens /* 38789Sahrens * Virtual device vector for RAID-Z. 392082Seschrock * 402082Seschrock * This vdev supports both single and double parity. For single parity, we 412082Seschrock * use a simple XOR of all the data columns. For double parity, we use both 422082Seschrock * the simple XOR as well as a technique described in "The mathematics of 432082Seschrock * RAID-6" by H. Peter Anvin. 
 * This technique defines a Galois field, GF(2^8),
 * over the integers expressible in a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is A^254.
 *
 * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
 * can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *
 * See the reconstruction code below for how P and Q can be used individually
 * or in concert to recover missing data columns.
76789Sahrens */ 77789Sahrens 78789Sahrens typedef struct raidz_col { 792082Seschrock uint64_t rc_devidx; /* child device index for I/O */ 802082Seschrock uint64_t rc_offset; /* device offset */ 812082Seschrock uint64_t rc_size; /* I/O size */ 822082Seschrock void *rc_data; /* I/O data */ 832082Seschrock int rc_error; /* I/O error for this device */ 842082Seschrock uint8_t rc_tried; /* Did we attempt this I/O column? */ 852082Seschrock uint8_t rc_skipped; /* Did we skip this I/O column? */ 86789Sahrens } raidz_col_t; 87789Sahrens 88789Sahrens typedef struct raidz_map { 892082Seschrock uint64_t rm_cols; /* Column count */ 902082Seschrock uint64_t rm_bigcols; /* Number of oversized columns */ 912082Seschrock uint64_t rm_asize; /* Actual total I/O size */ 922082Seschrock uint64_t rm_missingdata; /* Count of missing data devices */ 932082Seschrock uint64_t rm_missingparity; /* Count of missing parity devices */ 942082Seschrock uint64_t rm_firstdatacol; /* First data column/parity count */ 952082Seschrock raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 96789Sahrens } raidz_map_t; 97789Sahrens 982082Seschrock #define VDEV_RAIDZ_P 0 992082Seschrock #define VDEV_RAIDZ_Q 1 1002082Seschrock 1012082Seschrock #define VDEV_RAIDZ_MAXPARITY 2 1022082Seschrock 1032082Seschrock #define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) 1042082Seschrock 1052082Seschrock /* 1062082Seschrock * These two tables represent powers and logs of 2 in the Galois field defined 1072082Seschrock * above. These values were computed by repeatedly multiplying by 2 as above. 
1082082Seschrock */ 1092082Seschrock static const uint8_t vdev_raidz_pow2[256] = { 1102082Seschrock 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 1112082Seschrock 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 1122082Seschrock 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 1132082Seschrock 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 1142082Seschrock 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 1152082Seschrock 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 1162082Seschrock 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 1172082Seschrock 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 1182082Seschrock 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 1192082Seschrock 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 1202082Seschrock 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 1212082Seschrock 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 1222082Seschrock 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 1232082Seschrock 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 1242082Seschrock 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 1252082Seschrock 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 1262082Seschrock 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 1272082Seschrock 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 1282082Seschrock 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 1292082Seschrock 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 1302082Seschrock 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 1312082Seschrock 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 1322082Seschrock 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 1332082Seschrock 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 1342082Seschrock 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 1352082Seschrock 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 1362082Seschrock 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 1372082Seschrock 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 1382082Seschrock 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 1392082Seschrock 0xf7, 0xf3, 
0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 1402082Seschrock 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 1412082Seschrock 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 1422082Seschrock }; 1432082Seschrock static const uint8_t vdev_raidz_log2[256] = { 1442082Seschrock 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 1452082Seschrock 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 1462082Seschrock 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 1472082Seschrock 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 1482082Seschrock 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 1492082Seschrock 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 1502082Seschrock 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 1512082Seschrock 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 1522082Seschrock 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 1532082Seschrock 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 1542082Seschrock 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 1552082Seschrock 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 1562082Seschrock 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 1572082Seschrock 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 1582082Seschrock 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 1592082Seschrock 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 1602082Seschrock 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 1612082Seschrock 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 1622082Seschrock 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 1632082Seschrock 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 1642082Seschrock 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 1652082Seschrock 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 1662082Seschrock 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 1672082Seschrock 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 1682082Seschrock 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 1692082Seschrock 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 1702082Seschrock 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 
0xaa, 1712082Seschrock 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 1722082Seschrock 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 1732082Seschrock 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 1742082Seschrock 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 1752082Seschrock 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 1762082Seschrock }; 1772082Seschrock 1782082Seschrock /* 1792082Seschrock * Multiply a given number by 2 raised to the given power. 1802082Seschrock */ 1812082Seschrock static uint8_t 1822082Seschrock vdev_raidz_exp2(uint_t a, int exp) 1832082Seschrock { 1842082Seschrock if (a == 0) 1852082Seschrock return (0); 1862082Seschrock 1872082Seschrock ASSERT(exp >= 0); 1882082Seschrock ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 1892082Seschrock 1902082Seschrock exp += vdev_raidz_log2[a]; 1912082Seschrock if (exp > 255) 1922082Seschrock exp -= 255; 1932082Seschrock 1942082Seschrock return (vdev_raidz_pow2[exp]); 1952082Seschrock } 1962082Seschrock 197789Sahrens static raidz_map_t * 1982082Seschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 1992082Seschrock uint64_t nparity) 200789Sahrens { 201789Sahrens raidz_map_t *rm; 202789Sahrens uint64_t b = zio->io_offset >> unit_shift; 203789Sahrens uint64_t s = zio->io_size >> unit_shift; 204789Sahrens uint64_t f = b % dcols; 205789Sahrens uint64_t o = (b / dcols) << unit_shift; 2062082Seschrock uint64_t q, r, c, bc, col, acols, coff, devidx; 207789Sahrens 2082082Seschrock q = s / (dcols - nparity); 2092082Seschrock r = s - q * (dcols - nparity); 2102082Seschrock bc = (r == 0 ? 0 : r + nparity); 211789Sahrens 212789Sahrens acols = (q == 0 ? 
bc : dcols); 213789Sahrens 214789Sahrens rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); 215789Sahrens 216789Sahrens rm->rm_cols = acols; 217789Sahrens rm->rm_bigcols = bc; 218789Sahrens rm->rm_asize = 0; 2192082Seschrock rm->rm_missingdata = 0; 2202082Seschrock rm->rm_missingparity = 0; 2212082Seschrock rm->rm_firstdatacol = nparity; 222789Sahrens 223789Sahrens for (c = 0; c < acols; c++) { 224789Sahrens col = f + c; 225789Sahrens coff = o; 226789Sahrens if (col >= dcols) { 227789Sahrens col -= dcols; 228789Sahrens coff += 1ULL << unit_shift; 229789Sahrens } 2302082Seschrock rm->rm_col[c].rc_devidx = col; 231789Sahrens rm->rm_col[c].rc_offset = coff; 232789Sahrens rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; 233789Sahrens rm->rm_col[c].rc_data = NULL; 234789Sahrens rm->rm_col[c].rc_error = 0; 235789Sahrens rm->rm_col[c].rc_tried = 0; 236789Sahrens rm->rm_col[c].rc_skipped = 0; 237789Sahrens rm->rm_asize += rm->rm_col[c].rc_size; 238789Sahrens } 239789Sahrens 2402082Seschrock rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); 241789Sahrens 242789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++) 243789Sahrens rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 244789Sahrens 245789Sahrens rm->rm_col[c].rc_data = zio->io_data; 246789Sahrens 247789Sahrens for (c = c + 1; c < acols; c++) 248789Sahrens rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 249789Sahrens rm->rm_col[c - 1].rc_size; 250789Sahrens 2511133Seschrock /* 2522082Seschrock * If all data stored spans all columns, there's a danger that parity 2532082Seschrock * will always be on the same device and, since parity isn't read 2542082Seschrock * during normal operation, that that device's I/O bandwidth won't be 2552082Seschrock * used effectively. We therefore switch the parity every 1MB. 2562082Seschrock * 2572082Seschrock * ... at least that was, ostensibly, the theory. 
As a practical 2582082Seschrock * matter unless we juggle the parity between all devices evenly, we 2592082Seschrock * won't see any benefit. Further, occasional writes that aren't a 2602082Seschrock * multiple of the LCM of the number of children and the minimum 2612082Seschrock * stripe width are sufficient to avoid pessimal behavior. 2622082Seschrock * Unfortunately, this decision created an implicit on-disk format 263*3456Sahl * requirement that we need to support for all eternity, but only 264*3456Sahl * for single-parity RAID-Z. 2651133Seschrock */ 2661133Seschrock ASSERT(rm->rm_cols >= 2); 2671133Seschrock ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 268789Sahrens 2692082Seschrock if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 2702082Seschrock devidx = rm->rm_col[0].rc_devidx; 2711133Seschrock o = rm->rm_col[0].rc_offset; 2722082Seschrock rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 2731133Seschrock rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 2742082Seschrock rm->rm_col[1].rc_devidx = devidx; 2751133Seschrock rm->rm_col[1].rc_offset = o; 276789Sahrens } 277789Sahrens 278789Sahrens zio->io_vsd = rm; 279789Sahrens return (rm); 280789Sahrens } 281789Sahrens 282789Sahrens static void 283789Sahrens vdev_raidz_map_free(zio_t *zio) 284789Sahrens { 285789Sahrens raidz_map_t *rm = zio->io_vsd; 286789Sahrens int c; 287789Sahrens 288789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++) 289789Sahrens zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 290789Sahrens 291789Sahrens kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); 292789Sahrens zio->io_vsd = NULL; 293789Sahrens } 294789Sahrens 295789Sahrens static void 2962082Seschrock vdev_raidz_generate_parity_p(raidz_map_t *rm) 2972082Seschrock { 2982082Seschrock uint64_t *p, *src, pcount, ccount, i; 2992082Seschrock int c; 3002082Seschrock 3012082Seschrock pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 3022082Seschrock 3032082Seschrock for (c = 
rm->rm_firstdatacol; c < rm->rm_cols; c++) { 3042082Seschrock src = rm->rm_col[c].rc_data; 3052082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 3062082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 3072082Seschrock 3082082Seschrock if (c == rm->rm_firstdatacol) { 3092082Seschrock ASSERT(ccount == pcount); 3102082Seschrock for (i = 0; i < ccount; i++, p++, src++) { 3112082Seschrock *p = *src; 3122082Seschrock } 3132082Seschrock } else { 3142082Seschrock ASSERT(ccount <= pcount); 3152082Seschrock for (i = 0; i < ccount; i++, p++, src++) { 3162082Seschrock *p ^= *src; 3172082Seschrock } 3182082Seschrock } 3192082Seschrock } 3202082Seschrock } 3212082Seschrock 3222082Seschrock static void 3232082Seschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm) 324789Sahrens { 3252082Seschrock uint64_t *q, *p, *src, pcount, ccount, mask, i; 3262082Seschrock int c; 3272082Seschrock 3282082Seschrock pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 3292082Seschrock ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 3302082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 3312082Seschrock 3322082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 3332082Seschrock src = rm->rm_col[c].rc_data; 3342082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 3352082Seschrock q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 3362082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 3372082Seschrock 3382082Seschrock if (c == rm->rm_firstdatacol) { 3392082Seschrock ASSERT(ccount == pcount || ccount == 0); 3402082Seschrock for (i = 0; i < ccount; i++, p++, q++, src++) { 3412082Seschrock *q = *src; 3422082Seschrock *p = *src; 3432082Seschrock } 3442082Seschrock for (; i < pcount; i++, p++, q++, src++) { 3452082Seschrock *q = 0; 3462082Seschrock *p = 0; 3472082Seschrock } 3482082Seschrock } else { 3492082Seschrock ASSERT(ccount <= pcount); 350789Sahrens 3512082Seschrock /* 3522082Seschrock * Rather than multiplying each byte individually (as 3532082Seschrock * described above), 
we are able to handle 8 at once 3542082Seschrock * by generating a mask based on the high bit in each 3552082Seschrock * byte and using that to conditionally XOR in 0x1d. 3562082Seschrock */ 3572082Seschrock for (i = 0; i < ccount; i++, p++, q++, src++) { 3582082Seschrock mask = *q & 0x8080808080808080ULL; 3592082Seschrock mask = (mask << 1) - (mask >> 7); 3602082Seschrock *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 3612082Seschrock (mask & 0x1d1d1d1d1d1d1d1dULL); 3622082Seschrock *q ^= *src; 3632082Seschrock *p ^= *src; 3642082Seschrock } 3652082Seschrock 3662082Seschrock /* 3672082Seschrock * Treat short columns as though they are full of 0s. 3682082Seschrock */ 3692082Seschrock for (; i < pcount; i++, q++) { 3702082Seschrock mask = *q & 0x8080808080808080ULL; 3712082Seschrock mask = (mask << 1) - (mask >> 7); 3722082Seschrock *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 3732082Seschrock (mask & 0x1d1d1d1d1d1d1d1dULL); 3742082Seschrock } 3752082Seschrock } 3762082Seschrock } 3772082Seschrock } 3782082Seschrock 3792082Seschrock static void 3802082Seschrock vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) 3812082Seschrock { 3822082Seschrock uint64_t *dst, *src, xcount, ccount, count, i; 3832082Seschrock int c; 3842082Seschrock 3852082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 3862082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 3872082Seschrock ASSERT(xcount > 0); 3882082Seschrock 3892082Seschrock src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 3902082Seschrock dst = rm->rm_col[x].rc_data; 3912082Seschrock for (i = 0; i < xcount; i++, dst++, src++) { 3922082Seschrock *dst = *src; 3932082Seschrock } 3942082Seschrock 3952082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 396789Sahrens src = rm->rm_col[c].rc_data; 397789Sahrens dst = rm->rm_col[x].rc_data; 3982082Seschrock 3992082Seschrock if (c == x) 4002082Seschrock continue; 4012082Seschrock 4022082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 
4032082Seschrock count = MIN(ccount, xcount); 4042082Seschrock 4052082Seschrock for (i = 0; i < count; i++, dst++, src++) { 4062082Seschrock *dst ^= *src; 407789Sahrens } 408789Sahrens } 409789Sahrens } 410789Sahrens 4112082Seschrock static void 4122082Seschrock vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) 4132082Seschrock { 4142082Seschrock uint64_t *dst, *src, xcount, ccount, count, mask, i; 4152082Seschrock uint8_t *b; 4162082Seschrock int c, j, exp; 4172082Seschrock 4182082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 4192082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 4202082Seschrock 4212082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 4222082Seschrock src = rm->rm_col[c].rc_data; 4232082Seschrock dst = rm->rm_col[x].rc_data; 4242082Seschrock 4252082Seschrock if (c == x) 4262082Seschrock ccount = 0; 4272082Seschrock else 4282082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 4292082Seschrock 4302082Seschrock count = MIN(ccount, xcount); 4312082Seschrock 4322082Seschrock if (c == rm->rm_firstdatacol) { 4332082Seschrock for (i = 0; i < count; i++, dst++, src++) { 4342082Seschrock *dst = *src; 4352082Seschrock } 4362082Seschrock for (; i < xcount; i++, dst++) { 4372082Seschrock *dst = 0; 4382082Seschrock } 4392082Seschrock 4402082Seschrock } else { 4412082Seschrock /* 4422082Seschrock * For an explanation of this, see the comment in 4432082Seschrock * vdev_raidz_generate_parity_pq() above. 
4442082Seschrock */ 4452082Seschrock for (i = 0; i < count; i++, dst++, src++) { 4462082Seschrock mask = *dst & 0x8080808080808080ULL; 4472082Seschrock mask = (mask << 1) - (mask >> 7); 4482082Seschrock *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 4492082Seschrock (mask & 0x1d1d1d1d1d1d1d1dULL); 4502082Seschrock *dst ^= *src; 4512082Seschrock } 4522082Seschrock 4532082Seschrock for (; i < xcount; i++, dst++) { 4542082Seschrock mask = *dst & 0x8080808080808080ULL; 4552082Seschrock mask = (mask << 1) - (mask >> 7); 4562082Seschrock *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 4572082Seschrock (mask & 0x1d1d1d1d1d1d1d1dULL); 4582082Seschrock } 4592082Seschrock } 4602082Seschrock } 4612082Seschrock 4622082Seschrock src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 4632082Seschrock dst = rm->rm_col[x].rc_data; 4642082Seschrock exp = 255 - (rm->rm_cols - 1 - x); 4652082Seschrock 4662082Seschrock for (i = 0; i < xcount; i++, dst++, src++) { 4672082Seschrock *dst ^= *src; 4682082Seschrock for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 4692082Seschrock *b = vdev_raidz_exp2(*b, exp); 4702082Seschrock } 4712082Seschrock } 4722082Seschrock } 4732082Seschrock 4742082Seschrock static void 4752082Seschrock vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) 4762082Seschrock { 4772082Seschrock uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 4782082Seschrock void *pdata, *qdata; 4792082Seschrock uint64_t xsize, ysize, i; 4802082Seschrock 4812082Seschrock ASSERT(x < y); 4822082Seschrock ASSERT(x >= rm->rm_firstdatacol); 4832082Seschrock ASSERT(y < rm->rm_cols); 4842082Seschrock 4852082Seschrock ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 4862082Seschrock 4872082Seschrock /* 4882082Seschrock * Move the parity data aside -- we're going to compute parity as 4892082Seschrock * though columns x and y were full of zeros -- Pxy and Qxy. 
We want to 4902082Seschrock * reuse the parity generation mechanism without trashing the actual 4912082Seschrock * parity so we make those columns appear to be full of zeros by 4922082Seschrock * setting their lengths to zero. 4932082Seschrock */ 4942082Seschrock pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 4952082Seschrock qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 4962082Seschrock xsize = rm->rm_col[x].rc_size; 4972082Seschrock ysize = rm->rm_col[y].rc_size; 4982082Seschrock 4992082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = 5002082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 5012082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = 5022082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 5032082Seschrock rm->rm_col[x].rc_size = 0; 5042082Seschrock rm->rm_col[y].rc_size = 0; 5052082Seschrock 5062082Seschrock vdev_raidz_generate_parity_pq(rm); 5072082Seschrock 5082082Seschrock rm->rm_col[x].rc_size = xsize; 5092082Seschrock rm->rm_col[y].rc_size = ysize; 5102082Seschrock 5112082Seschrock p = pdata; 5122082Seschrock q = qdata; 5132082Seschrock pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 5142082Seschrock qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 5152082Seschrock xd = rm->rm_col[x].rc_data; 5162082Seschrock yd = rm->rm_col[y].rc_data; 5172082Seschrock 5182082Seschrock /* 5192082Seschrock * We now have: 5202082Seschrock * Pxy = P + D_x + D_y 5212082Seschrock * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 5222082Seschrock * 5232082Seschrock * We can then solve for D_x: 5242082Seschrock * D_x = A * (P + Pxy) + B * (Q + Qxy) 5252082Seschrock * where 5262082Seschrock * A = 2^(x - y) * (2^(x - y) + 1)^-1 5272082Seschrock * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 5282082Seschrock * 5292082Seschrock * With D_x in hand, we can easily solve for D_y: 5302082Seschrock * D_y = P + Pxy + D_x 5312082Seschrock */ 5322082Seschrock 5332082Seschrock a = vdev_raidz_pow2[255 + x - y]; 5342082Seschrock b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 
5352082Seschrock tmp = 255 - vdev_raidz_log2[a ^ 1]; 5362082Seschrock 5372082Seschrock aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 5382082Seschrock bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 5392082Seschrock 5402082Seschrock for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 5412082Seschrock *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 5422082Seschrock vdev_raidz_exp2(*q ^ *qxy, bexp); 5432082Seschrock 5442082Seschrock if (i < ysize) 5452082Seschrock *yd = *p ^ *pxy ^ *xd; 5462082Seschrock } 5472082Seschrock 5482082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 5492082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_size); 5502082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 5512082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 5522082Seschrock 5532082Seschrock /* 5542082Seschrock * Restore the saved parity data. 5552082Seschrock */ 5562082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 5572082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 5582082Seschrock } 5592082Seschrock 5602082Seschrock 561789Sahrens static int 562789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 563789Sahrens { 564789Sahrens vdev_t *cvd; 5652082Seschrock uint64_t nparity = vd->vdev_nparity; 566789Sahrens int c, error; 567789Sahrens int lasterror = 0; 568789Sahrens int numerrors = 0; 569789Sahrens 5702082Seschrock ASSERT(nparity > 0); 5712082Seschrock 5722082Seschrock if (nparity > VDEV_RAIDZ_MAXPARITY || 5732082Seschrock vd->vdev_children < nparity + 1) { 574789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 575789Sahrens return (EINVAL); 576789Sahrens } 577789Sahrens 578789Sahrens for (c = 0; c < vd->vdev_children; c++) { 579789Sahrens cvd = vd->vdev_child[c]; 580789Sahrens 581789Sahrens if ((error = vdev_open(cvd)) != 0) { 582789Sahrens lasterror = error; 583789Sahrens numerrors++; 584789Sahrens continue; 585789Sahrens } 586789Sahrens 587789Sahrens *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 5881732Sbonwick *ashift = 
MAX(*ashift, cvd->vdev_ashift); 589789Sahrens } 590789Sahrens 591789Sahrens *asize *= vd->vdev_children; 592789Sahrens 5932082Seschrock if (numerrors > nparity) { 594789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 595789Sahrens return (lasterror); 596789Sahrens } 597789Sahrens 598789Sahrens return (0); 599789Sahrens } 600789Sahrens 601789Sahrens static void 602789Sahrens vdev_raidz_close(vdev_t *vd) 603789Sahrens { 604789Sahrens int c; 605789Sahrens 606789Sahrens for (c = 0; c < vd->vdev_children; c++) 607789Sahrens vdev_close(vd->vdev_child[c]); 608789Sahrens } 609789Sahrens 610789Sahrens static uint64_t 611789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize) 612789Sahrens { 613789Sahrens uint64_t asize; 6141732Sbonwick uint64_t ashift = vd->vdev_top->vdev_ashift; 615789Sahrens uint64_t cols = vd->vdev_children; 6162082Seschrock uint64_t nparity = vd->vdev_nparity; 617789Sahrens 6181732Sbonwick asize = ((psize - 1) >> ashift) + 1; 6192082Seschrock asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 6202082Seschrock asize = roundup(asize, nparity + 1) << ashift; 621789Sahrens 622789Sahrens return (asize); 623789Sahrens } 624789Sahrens 625789Sahrens static void 626789Sahrens vdev_raidz_child_done(zio_t *zio) 627789Sahrens { 628789Sahrens raidz_col_t *rc = zio->io_private; 629789Sahrens 630789Sahrens rc->rc_error = zio->io_error; 631789Sahrens rc->rc_tried = 1; 632789Sahrens rc->rc_skipped = 0; 633789Sahrens } 634789Sahrens 635789Sahrens static void 636789Sahrens vdev_raidz_repair_done(zio_t *zio) 637789Sahrens { 6381732Sbonwick ASSERT(zio->io_private == zio->io_parent); 6391732Sbonwick vdev_raidz_map_free(zio->io_private); 640789Sahrens } 641789Sahrens 642789Sahrens static void 643789Sahrens vdev_raidz_io_start(zio_t *zio) 644789Sahrens { 645789Sahrens vdev_t *vd = zio->io_vd; 6461732Sbonwick vdev_t *tvd = vd->vdev_top; 647789Sahrens vdev_t *cvd; 648789Sahrens blkptr_t *bp = zio->io_bp; 649789Sahrens raidz_map_t *rm; 650789Sahrens 
raidz_col_t *rc; 651789Sahrens int c; 652789Sahrens 6532082Seschrock rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 6542082Seschrock vd->vdev_nparity); 655789Sahrens 6561775Sbillm ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 657789Sahrens 658789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 659789Sahrens /* 6602082Seschrock * Generate RAID parity in the first virtual columns. 661789Sahrens */ 6622082Seschrock if (rm->rm_firstdatacol == 1) 6632082Seschrock vdev_raidz_generate_parity_p(rm); 6642082Seschrock else 6652082Seschrock vdev_raidz_generate_parity_pq(rm); 666789Sahrens 667789Sahrens for (c = 0; c < rm->rm_cols; c++) { 668789Sahrens rc = &rm->rm_col[c]; 6692082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 670789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 671789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 672789Sahrens zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 673789Sahrens vdev_raidz_child_done, rc)); 674789Sahrens } 675789Sahrens zio_wait_children_done(zio); 676789Sahrens return; 677789Sahrens } 678789Sahrens 679789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 680789Sahrens 6812082Seschrock /* 6822082Seschrock * Iterate over the columns in reverse order so that we hit the parity 6832082Seschrock * last -- any errors along the way will force us to read the parity 6842082Seschrock * data. 
6852082Seschrock */ 686789Sahrens for (c = rm->rm_cols - 1; c >= 0; c--) { 687789Sahrens rc = &rm->rm_col[c]; 6882082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 689789Sahrens if (vdev_is_dead(cvd)) { 6902082Seschrock if (c >= rm->rm_firstdatacol) 6912082Seschrock rm->rm_missingdata++; 6922082Seschrock else 6932082Seschrock rm->rm_missingparity++; 694789Sahrens rc->rc_error = ENXIO; 695789Sahrens rc->rc_tried = 1; /* don't even try */ 696789Sahrens rc->rc_skipped = 1; 697789Sahrens continue; 698789Sahrens } 699789Sahrens if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { 7002082Seschrock if (c >= rm->rm_firstdatacol) 7012082Seschrock rm->rm_missingdata++; 7022082Seschrock else 7032082Seschrock rm->rm_missingparity++; 704789Sahrens rc->rc_error = ESTALE; 705789Sahrens rc->rc_skipped = 1; 706789Sahrens continue; 707789Sahrens } 7082082Seschrock if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 709789Sahrens (zio->io_flags & ZIO_FLAG_SCRUB)) { 710789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 711789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 712789Sahrens zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 713789Sahrens vdev_raidz_child_done, rc)); 714789Sahrens } 715789Sahrens } 716789Sahrens 717789Sahrens zio_wait_children_done(zio); 718789Sahrens } 719789Sahrens 7201544Seschrock /* 7211544Seschrock * Report a checksum error for a child of a RAID-Z device. 
7221544Seschrock */ 7231544Seschrock static void 7241544Seschrock raidz_checksum_error(zio_t *zio, raidz_col_t *rc) 7251544Seschrock { 7262082Seschrock vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 7271544Seschrock dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", 7281544Seschrock vdev_description(vd)); 7291544Seschrock 7301544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 7311544Seschrock mutex_enter(&vd->vdev_stat_lock); 7321544Seschrock vd->vdev_stat.vs_checksum_errors++; 7331544Seschrock mutex_exit(&vd->vdev_stat_lock); 7341544Seschrock } 7351544Seschrock 7361544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 7371544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 7381544Seschrock zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); 7391544Seschrock } 7401544Seschrock 7412082Seschrock /* 7422082Seschrock * Generate the parity from the data columns. If we tried and were able to 7432082Seschrock * read the parity without error, verify that the generated parity matches the 7442082Seschrock * data we read. If it doesn't, we fire off a checksum error. Return the 7452082Seschrock * number such failures. 
7462082Seschrock */ 7472082Seschrock static int 7482082Seschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 7492082Seschrock { 7502082Seschrock void *orig[VDEV_RAIDZ_MAXPARITY]; 7512082Seschrock int c, ret = 0; 7522082Seschrock raidz_col_t *rc; 7532082Seschrock 7542082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 7552082Seschrock rc = &rm->rm_col[c]; 7562082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 7572082Seschrock continue; 7582082Seschrock orig[c] = zio_buf_alloc(rc->rc_size); 7592082Seschrock bcopy(rc->rc_data, orig[c], rc->rc_size); 7602082Seschrock } 7612082Seschrock 7622082Seschrock if (rm->rm_firstdatacol == 1) 7632082Seschrock vdev_raidz_generate_parity_p(rm); 7642082Seschrock else 7652082Seschrock vdev_raidz_generate_parity_pq(rm); 7662082Seschrock 7672082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 7682082Seschrock rc = &rm->rm_col[c]; 7692082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 7702082Seschrock continue; 7712082Seschrock if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 7722082Seschrock raidz_checksum_error(zio, rc); 7732082Seschrock rc->rc_error = ECKSUM; 7742082Seschrock ret++; 7752082Seschrock } 7762082Seschrock zio_buf_free(orig[c], rc->rc_size); 7772082Seschrock } 7782082Seschrock 7792082Seschrock return (ret); 7802082Seschrock } 7812082Seschrock 7822082Seschrock static uint64_t raidz_corrected_p; 7832082Seschrock static uint64_t raidz_corrected_q; 7842082Seschrock static uint64_t raidz_corrected_pq; 7851544Seschrock 786789Sahrens static void 787789Sahrens vdev_raidz_io_done(zio_t *zio) 788789Sahrens { 789789Sahrens vdev_t *vd = zio->io_vd; 790789Sahrens vdev_t *cvd; 791789Sahrens raidz_map_t *rm = zio->io_vsd; 7922082Seschrock raidz_col_t *rc, *rc1; 793789Sahrens int unexpected_errors = 0; 7942082Seschrock int parity_errors = 0; 795*3456Sahl int parity_untried = 0; 7962082Seschrock int data_errors = 0; 7972082Seschrock int n, c, c1; 798789Sahrens 7991775Sbillm ASSERT(zio->io_bp != NULL); /* XXX need to 
add code to enforce this */ 800789Sahrens 801789Sahrens zio->io_error = 0; 802789Sahrens zio->io_numerrors = 0; 803789Sahrens 8042082Seschrock ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 8052082Seschrock ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 8062082Seschrock 807789Sahrens for (c = 0; c < rm->rm_cols; c++) { 808789Sahrens rc = &rm->rm_col[c]; 809789Sahrens 810789Sahrens /* 811789Sahrens * We preserve any EIOs because those may be worth retrying; 812789Sahrens * whereas ECKSUM and ENXIO are more likely to be persistent. 813789Sahrens */ 814789Sahrens if (rc->rc_error) { 815789Sahrens if (zio->io_error != EIO) 816789Sahrens zio->io_error = rc->rc_error; 8172082Seschrock 8182082Seschrock if (c < rm->rm_firstdatacol) 8192082Seschrock parity_errors++; 8202082Seschrock else 8212082Seschrock data_errors++; 8222082Seschrock 823789Sahrens if (!rc->rc_skipped) 824789Sahrens unexpected_errors++; 8252082Seschrock 826789Sahrens zio->io_numerrors++; 827*3456Sahl } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 828*3456Sahl parity_untried++; 829789Sahrens } 830789Sahrens } 831789Sahrens 832789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 833789Sahrens /* 834789Sahrens * If this is not a failfast write, and we were able to 835789Sahrens * write enough columns to reconstruct the data, good enough. 836789Sahrens */ 837789Sahrens /* XXPOLICY */ 838789Sahrens if (zio->io_numerrors <= rm->rm_firstdatacol && 839789Sahrens !(zio->io_flags & ZIO_FLAG_FAILFAST)) 840789Sahrens zio->io_error = 0; 841789Sahrens 842789Sahrens vdev_raidz_map_free(zio); 843789Sahrens zio_next_stage(zio); 844789Sahrens return; 845789Sahrens } 846789Sahrens 847789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 8482082Seschrock /* 8492082Seschrock * There are three potential phases for a read: 8502082Seschrock * 1. produce valid data from the columns read 8512082Seschrock * 2. read all disks and try again 8522082Seschrock * 3. 
perform combinatorial reconstruction 8532082Seschrock * 8542082Seschrock * Each phase is progressively both more expensive and less likely to 8552082Seschrock * occur. If we encounter more errors than we can repair or all phases 8562082Seschrock * fail, we have no choice but to return an error. 8572082Seschrock */ 858789Sahrens 859789Sahrens /* 8602082Seschrock * If the number of errors we saw was correctable -- less than or equal 861*3456Sahl * to the number of parity disks read -- attempt to produce data that 862*3456Sahl * has a valid checksum. Naturally, this case applies in the absence of 863*3456Sahl * any errors. 864789Sahrens */ 865*3456Sahl if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) { 8662082Seschrock switch (data_errors) { 8672082Seschrock case 0: 8682082Seschrock if (zio_checksum_error(zio) == 0) { 8692082Seschrock zio->io_error = 0; 870*3456Sahl if (parity_errors + parity_untried < 871*3456Sahl rm->rm_firstdatacol) { 872*3456Sahl n = raidz_parity_verify(zio, rm); 873*3456Sahl unexpected_errors += n; 874*3456Sahl ASSERT(parity_errors + n <= 875*3456Sahl rm->rm_firstdatacol); 876*3456Sahl } 8772082Seschrock goto done; 8782082Seschrock } 8792082Seschrock break; 8802082Seschrock 8812082Seschrock case 1: 882*3456Sahl /* 883*3456Sahl * We either attempt to read all the parity columns or 884*3456Sahl * none of them. If we didn't try to read parity, we 885*3456Sahl * wouldn't be here in the correctable case. There must 886*3456Sahl * also have been fewer parity errors than parity 887*3456Sahl * columns or, again, we wouldn't be in this code path. 888*3456Sahl */ 889*3456Sahl ASSERT(parity_untried == 0); 8902082Seschrock ASSERT(parity_errors < rm->rm_firstdatacol); 8912082Seschrock 8922082Seschrock /* 8932082Seschrock * Find the column that reported the error. 
8942082Seschrock */ 8952082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 8962082Seschrock rc = &rm->rm_col[c]; 8972082Seschrock if (rc->rc_error != 0) 8982082Seschrock break; 8992082Seschrock } 9002082Seschrock ASSERT(c != rm->rm_cols); 9012082Seschrock ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 9022082Seschrock rc->rc_error == ESTALE); 9032082Seschrock 9042082Seschrock if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { 9052082Seschrock vdev_raidz_reconstruct_p(rm, c); 9062082Seschrock } else { 9072082Seschrock ASSERT(rm->rm_firstdatacol > 1); 9082082Seschrock vdev_raidz_reconstruct_q(rm, c); 9092082Seschrock } 9102082Seschrock 9112082Seschrock if (zio_checksum_error(zio) == 0) { 9122082Seschrock zio->io_error = 0; 9132082Seschrock if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) 9142082Seschrock atomic_inc_64(&raidz_corrected_p); 9152082Seschrock else 9162082Seschrock atomic_inc_64(&raidz_corrected_q); 917789Sahrens 9182082Seschrock /* 919*3456Sahl * If there's more than one parity disk that 920*3456Sahl * was successfully read, confirm that the 921*3456Sahl * other parity disk produced the correct data. 922*3456Sahl * This routine is suboptimal in that it 923*3456Sahl * regenerates both the parity we wish to test 924*3456Sahl * as well as the parity we just used to 925*3456Sahl * perform the reconstruction, but this should 926*3456Sahl * be a relatively uncommon case, and can be 927*3456Sahl * optimized if it becomes a problem. 9282082Seschrock */ 929*3456Sahl if (parity_errors < rm->rm_firstdatacol - 1) { 9302082Seschrock n = raidz_parity_verify(zio, rm); 9312082Seschrock unexpected_errors += n; 9322082Seschrock ASSERT(parity_errors + n <= 9332082Seschrock rm->rm_firstdatacol); 9342082Seschrock } 9352082Seschrock 9362082Seschrock goto done; 9372082Seschrock } 9382082Seschrock break; 9392082Seschrock 9402082Seschrock case 2: 9412082Seschrock /* 942*3456Sahl * Two data column errors require double parity. 
943*3456Sahl */ 944*3456Sahl ASSERT(rm->rm_firstdatacol == 2); 945*3456Sahl 946*3456Sahl /* 9472082Seschrock * Find the two columns that reported errors. 9482082Seschrock */ 9492082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 9502082Seschrock rc = &rm->rm_col[c]; 9512082Seschrock if (rc->rc_error != 0) 9522082Seschrock break; 953789Sahrens } 9542082Seschrock ASSERT(c != rm->rm_cols); 9552082Seschrock ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 9562082Seschrock rc->rc_error == ESTALE); 9572082Seschrock 9582082Seschrock for (c1 = c++; c < rm->rm_cols; c++) { 9592082Seschrock rc = &rm->rm_col[c]; 9602082Seschrock if (rc->rc_error != 0) 9612082Seschrock break; 9622082Seschrock } 9632082Seschrock ASSERT(c != rm->rm_cols); 9642082Seschrock ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 9652082Seschrock rc->rc_error == ESTALE); 966789Sahrens 9672082Seschrock vdev_raidz_reconstruct_pq(rm, c1, c); 9682082Seschrock 9692082Seschrock if (zio_checksum_error(zio) == 0) { 9702082Seschrock zio->io_error = 0; 9712082Seschrock atomic_inc_64(&raidz_corrected_pq); 9722082Seschrock 9732082Seschrock goto done; 9742082Seschrock } 9752082Seschrock break; 9762082Seschrock 9772082Seschrock default: 9782082Seschrock ASSERT(rm->rm_firstdatacol <= 2); 9792082Seschrock ASSERT(0); 980789Sahrens } 981789Sahrens } 982789Sahrens 983789Sahrens /* 9842082Seschrock * This isn't a typical situation -- either we got a read error or 9852082Seschrock * a child silently returned bad data. Read every block so we can 9862082Seschrock * try again with as much data and parity as we can track down. If 9872082Seschrock * we've already been through once before, all children will be marked 9882082Seschrock * as tried so we'll proceed to combinatorial reconstruction. 
989789Sahrens */ 990789Sahrens unexpected_errors = 1; 9912082Seschrock rm->rm_missingdata = 0; 9922082Seschrock rm->rm_missingparity = 0; 993789Sahrens 9942082Seschrock for (c = 0; c < rm->rm_cols; c++) { 9952082Seschrock if (rm->rm_col[c].rc_tried) 9962082Seschrock continue; 997789Sahrens 998789Sahrens zio->io_error = 0; 999789Sahrens zio_vdev_io_redone(zio); 10002082Seschrock do { 1001789Sahrens rc = &rm->rm_col[c]; 1002789Sahrens if (rc->rc_tried) 1003789Sahrens continue; 1004789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, 10052082Seschrock vd->vdev_child[rc->rc_devidx], 1006789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 1007789Sahrens zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 1008789Sahrens vdev_raidz_child_done, rc)); 10092082Seschrock } while (++c < rm->rm_cols); 10102082Seschrock dprintf("rereading\n"); 1011789Sahrens zio_wait_children_done(zio); 1012789Sahrens return; 1013789Sahrens } 1014789Sahrens 1015789Sahrens /* 10162082Seschrock * At this point we've attempted to reconstruct the data given the 10172082Seschrock * errors we detected, and we've attempted to read all columns. There 10182082Seschrock * must, therefore, be one or more additional problems -- silent errors 10192082Seschrock * resulting in invalid data rather than explicit I/O errors resulting 10202082Seschrock * in absent data. Before we attempt combinatorial reconstruction make 10212082Seschrock * sure we have a chance of coming up with the right answer. 1022789Sahrens */ 10232082Seschrock if (zio->io_numerrors >= rm->rm_firstdatacol) { 1024789Sahrens ASSERT(zio->io_error != 0); 1025789Sahrens goto done; 1026789Sahrens } 1027789Sahrens 10282082Seschrock if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { 10292082Seschrock /* 10302082Seschrock * Attempt to reconstruct the data from parity P. 
10312082Seschrock */ 10322082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 10332082Seschrock void *orig; 10342082Seschrock rc = &rm->rm_col[c]; 10352082Seschrock 10362082Seschrock orig = zio_buf_alloc(rc->rc_size); 10372082Seschrock bcopy(rc->rc_data, orig, rc->rc_size); 10382082Seschrock vdev_raidz_reconstruct_p(rm, c); 10392082Seschrock 10402082Seschrock if (zio_checksum_error(zio) == 0) { 10412082Seschrock zio_buf_free(orig, rc->rc_size); 10422082Seschrock zio->io_error = 0; 10432082Seschrock atomic_inc_64(&raidz_corrected_p); 10442082Seschrock 10452082Seschrock /* 10462082Seschrock * If this child didn't know that it returned 10472082Seschrock * bad data, inform it. 10482082Seschrock */ 10492082Seschrock if (rc->rc_tried && rc->rc_error == 0) 10502082Seschrock raidz_checksum_error(zio, rc); 10512082Seschrock rc->rc_error = ECKSUM; 10522082Seschrock goto done; 10532082Seschrock } 10542082Seschrock 10552082Seschrock bcopy(orig, rc->rc_data, rc->rc_size); 10562082Seschrock zio_buf_free(orig, rc->rc_size); 10572082Seschrock } 10582082Seschrock } 10592082Seschrock 10602082Seschrock if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { 10612082Seschrock /* 10622082Seschrock * Attempt to reconstruct the data from parity Q. 10632082Seschrock */ 10642082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 10652082Seschrock void *orig; 10662082Seschrock rc = &rm->rm_col[c]; 10672082Seschrock 10682082Seschrock orig = zio_buf_alloc(rc->rc_size); 10692082Seschrock bcopy(rc->rc_data, orig, rc->rc_size); 10702082Seschrock vdev_raidz_reconstruct_q(rm, c); 10712082Seschrock 10722082Seschrock if (zio_checksum_error(zio) == 0) { 10732082Seschrock zio_buf_free(orig, rc->rc_size); 1074789Sahrens zio->io_error = 0; 10752082Seschrock atomic_inc_64(&raidz_corrected_q); 10762082Seschrock 10772082Seschrock /* 10782082Seschrock * If this child didn't know that it returned 10792082Seschrock * bad data, inform it. 
10802082Seschrock */ 10812082Seschrock if (rc->rc_tried && rc->rc_error == 0) 10822082Seschrock raidz_checksum_error(zio, rc); 10832082Seschrock rc->rc_error = ECKSUM; 10842082Seschrock goto done; 10852082Seschrock } 10862082Seschrock 10872082Seschrock bcopy(orig, rc->rc_data, rc->rc_size); 10882082Seschrock zio_buf_free(orig, rc->rc_size); 10892082Seschrock } 10902082Seschrock } 10912082Seschrock 10922082Seschrock if (rm->rm_firstdatacol > 1 && 10932082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && 10942082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { 10952082Seschrock /* 10962082Seschrock * Attempt to reconstruct the data from both P and Q. 10972082Seschrock */ 10982082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { 10992082Seschrock void *orig, *orig1; 11002082Seschrock rc = &rm->rm_col[c]; 11012082Seschrock 11022082Seschrock orig = zio_buf_alloc(rc->rc_size); 11032082Seschrock bcopy(rc->rc_data, orig, rc->rc_size); 11042082Seschrock 11052082Seschrock for (c1 = c + 1; c1 < rm->rm_cols; c1++) { 11062082Seschrock rc1 = &rm->rm_col[c1]; 11072082Seschrock 11082082Seschrock orig1 = zio_buf_alloc(rc1->rc_size); 11092082Seschrock bcopy(rc1->rc_data, orig1, rc1->rc_size); 11102082Seschrock 11112082Seschrock vdev_raidz_reconstruct_pq(rm, c, c1); 11122082Seschrock 11132082Seschrock if (zio_checksum_error(zio) == 0) { 11142082Seschrock zio_buf_free(orig, rc->rc_size); 11152082Seschrock zio_buf_free(orig1, rc1->rc_size); 11162082Seschrock zio->io_error = 0; 11172082Seschrock atomic_inc_64(&raidz_corrected_pq); 11182082Seschrock 11192082Seschrock /* 11202082Seschrock * If these children didn't know they 11212082Seschrock * returned bad data, inform them. 
11222082Seschrock */ 11232082Seschrock if (rc->rc_tried && rc->rc_error == 0) 11242082Seschrock raidz_checksum_error(zio, rc); 11252082Seschrock if (rc1->rc_tried && rc1->rc_error == 0) 11262082Seschrock raidz_checksum_error(zio, rc1); 11272082Seschrock 11282082Seschrock rc->rc_error = ECKSUM; 11292082Seschrock rc1->rc_error = ECKSUM; 11302082Seschrock 11312082Seschrock goto done; 11322082Seschrock } 11332082Seschrock 11342082Seschrock bcopy(orig1, rc1->rc_data, rc1->rc_size); 11352082Seschrock zio_buf_free(orig1, rc1->rc_size); 11362082Seschrock } 11372082Seschrock 11382082Seschrock bcopy(orig, rc->rc_data, rc->rc_size); 11392082Seschrock zio_buf_free(orig, rc->rc_size); 1140789Sahrens } 1141789Sahrens } 1142789Sahrens 1143789Sahrens /* 11442082Seschrock * All combinations failed to checksum. Generate checksum ereports for 11452082Seschrock * all children. 1146789Sahrens */ 1147789Sahrens zio->io_error = ECKSUM; 11481544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 11491544Seschrock for (c = 0; c < rm->rm_cols; c++) { 11501544Seschrock rc = &rm->rm_col[c]; 11511544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 11522082Seschrock zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, 11531544Seschrock rc->rc_offset, rc->rc_size); 11541544Seschrock } 11551544Seschrock } 1156789Sahrens 1157789Sahrens done: 1158789Sahrens zio_checksum_verified(zio); 1159789Sahrens 1160789Sahrens if (zio->io_error == 0 && (spa_mode & FWRITE) && 1161789Sahrens (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 11621732Sbonwick zio_t *rio; 11631732Sbonwick 1164789Sahrens /* 1165789Sahrens * Use the good data we have in hand to repair damaged children. 11661732Sbonwick * 11671732Sbonwick * We issue all repair I/Os as children of 'rio' to arrange 11681732Sbonwick * that vdev_raidz_map_free(zio) will be invoked after all 11691732Sbonwick * repairs complete, but before we advance to the next stage. 
1170789Sahrens */ 11711732Sbonwick rio = zio_null(zio, zio->io_spa, 11721732Sbonwick vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL); 11731732Sbonwick 1174789Sahrens for (c = 0; c < rm->rm_cols; c++) { 1175789Sahrens rc = &rm->rm_col[c]; 11762082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 1177789Sahrens 11781732Sbonwick if (rc->rc_error == 0) 11791732Sbonwick continue; 11801732Sbonwick 11811732Sbonwick dprintf("%s resilvered %s @ 0x%llx error %d\n", 11821732Sbonwick vdev_description(vd), 11831732Sbonwick vdev_description(cvd), 11841732Sbonwick zio->io_offset, rc->rc_error); 1185789Sahrens 11861732Sbonwick zio_nowait(zio_vdev_child_io(rio, NULL, cvd, 11871732Sbonwick rc->rc_offset, rc->rc_data, rc->rc_size, 11881732Sbonwick ZIO_TYPE_WRITE, zio->io_priority, 11892082Seschrock ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE | 11902082Seschrock ZIO_FLAG_CANFAIL, NULL, NULL)); 11911732Sbonwick } 1192789Sahrens 11931732Sbonwick zio_nowait(rio); 11941732Sbonwick zio_wait_children_done(zio); 11951732Sbonwick return; 1196789Sahrens } 1197789Sahrens 1198789Sahrens vdev_raidz_map_free(zio); 1199789Sahrens zio_next_stage(zio); 1200789Sahrens } 1201789Sahrens 1202789Sahrens static void 1203789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 1204789Sahrens { 12052082Seschrock if (faulted > vd->vdev_nparity) 12061544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 12071544Seschrock VDEV_AUX_NO_REPLICAS); 1208789Sahrens else if (degraded + faulted != 0) 12091544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 1210789Sahrens else 12111544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 1212789Sahrens } 1213789Sahrens 1214789Sahrens vdev_ops_t vdev_raidz_ops = { 1215789Sahrens vdev_raidz_open, 1216789Sahrens vdev_raidz_close, 1217789Sahrens vdev_raidz_asize, 1218789Sahrens vdev_raidz_io_start, 1219789Sahrens vdev_raidz_io_done, 1220789Sahrens vdev_raidz_state_change, 1221789Sahrens VDEV_TYPE_RAIDZ, /* 
name of this vdev type */ 1222789Sahrens B_FALSE /* not a leaf vdev */ 1223789Sahrens }; 1224