1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 21*2082Seschrock 22789Sahrens /* 231544Seschrock * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24789Sahrens * Use is subject to license terms. 25789Sahrens */ 26789Sahrens 27789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28789Sahrens 29789Sahrens #include <sys/zfs_context.h> 30789Sahrens #include <sys/spa.h> 31789Sahrens #include <sys/vdev_impl.h> 32789Sahrens #include <sys/zio.h> 33789Sahrens #include <sys/zio_checksum.h> 34789Sahrens #include <sys/fs/zfs.h> 351544Seschrock #include <sys/fm/fs/zfs.h> 36789Sahrens 37789Sahrens /* 38789Sahrens * Virtual device vector for RAID-Z. 39*2082Seschrock * 40*2082Seschrock * This vdev supports both single and double parity. For single parity, we 41*2082Seschrock * use a simple XOR of all the data columns. For double parity, we use both 42*2082Seschrock * the simple XOR as well as a technique described in "The mathematics of 43*2082Seschrock * RAID-6" by H. Peter Anvin. 
This technique defines a Galois field, GF(2^8),
 * over the integers expressible in a single byte.  Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field.  We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition).  The inverse of a field element A (A^-1) is A^254.
 *
 * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
 * can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *
 * See the reconstruction code below for how P and Q can be used individually
 * or in concert to recover missing data columns.
76789Sahrens */ 77789Sahrens 78789Sahrens typedef struct raidz_col { 79*2082Seschrock uint64_t rc_devidx; /* child device index for I/O */ 80*2082Seschrock uint64_t rc_offset; /* device offset */ 81*2082Seschrock uint64_t rc_size; /* I/O size */ 82*2082Seschrock void *rc_data; /* I/O data */ 83*2082Seschrock int rc_error; /* I/O error for this device */ 84*2082Seschrock uint8_t rc_tried; /* Did we attempt this I/O column? */ 85*2082Seschrock uint8_t rc_skipped; /* Did we skip this I/O column? */ 86789Sahrens } raidz_col_t; 87789Sahrens 88789Sahrens typedef struct raidz_map { 89*2082Seschrock uint64_t rm_cols; /* Column count */ 90*2082Seschrock uint64_t rm_bigcols; /* Number of oversized columns */ 91*2082Seschrock uint64_t rm_asize; /* Actual total I/O size */ 92*2082Seschrock uint64_t rm_missingdata; /* Count of missing data devices */ 93*2082Seschrock uint64_t rm_missingparity; /* Count of missing parity devices */ 94*2082Seschrock uint64_t rm_firstdatacol; /* First data column/parity count */ 95*2082Seschrock raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 96789Sahrens } raidz_map_t; 97789Sahrens 98*2082Seschrock #define VDEV_RAIDZ_P 0 99*2082Seschrock #define VDEV_RAIDZ_Q 1 100*2082Seschrock 101*2082Seschrock #define VDEV_RAIDZ_MAXPARITY 2 102*2082Seschrock 103*2082Seschrock #define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) 104*2082Seschrock 105*2082Seschrock /* 106*2082Seschrock * These two tables represent powers and logs of 2 in the Galois field defined 107*2082Seschrock * above. These values were computed by repeatedly multiplying by 2 as above. 
108*2082Seschrock */ 109*2082Seschrock static const uint8_t vdev_raidz_pow2[256] = { 110*2082Seschrock 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 111*2082Seschrock 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 112*2082Seschrock 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 113*2082Seschrock 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 114*2082Seschrock 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 115*2082Seschrock 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 116*2082Seschrock 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 117*2082Seschrock 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 118*2082Seschrock 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 119*2082Seschrock 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 120*2082Seschrock 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 121*2082Seschrock 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 122*2082Seschrock 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 123*2082Seschrock 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 124*2082Seschrock 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 125*2082Seschrock 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 126*2082Seschrock 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 127*2082Seschrock 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 128*2082Seschrock 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 129*2082Seschrock 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 130*2082Seschrock 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 131*2082Seschrock 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 132*2082Seschrock 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 133*2082Seschrock 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 134*2082Seschrock 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 135*2082Seschrock 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 136*2082Seschrock 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 137*2082Seschrock 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 138*2082Seschrock 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 
139*2082Seschrock 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 140*2082Seschrock 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 141*2082Seschrock 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 142*2082Seschrock }; 143*2082Seschrock static const uint8_t vdev_raidz_log2[256] = { 144*2082Seschrock 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 145*2082Seschrock 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 146*2082Seschrock 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 147*2082Seschrock 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 148*2082Seschrock 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 149*2082Seschrock 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 150*2082Seschrock 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 151*2082Seschrock 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 152*2082Seschrock 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 153*2082Seschrock 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 154*2082Seschrock 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 155*2082Seschrock 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 156*2082Seschrock 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 157*2082Seschrock 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 158*2082Seschrock 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 159*2082Seschrock 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 160*2082Seschrock 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 161*2082Seschrock 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 162*2082Seschrock 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 163*2082Seschrock 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 164*2082Seschrock 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 165*2082Seschrock 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 166*2082Seschrock 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 167*2082Seschrock 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 168*2082Seschrock 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 169*2082Seschrock 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 
170*2082Seschrock 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 171*2082Seschrock 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 172*2082Seschrock 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 173*2082Seschrock 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 174*2082Seschrock 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 175*2082Seschrock 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 176*2082Seschrock }; 177*2082Seschrock 178*2082Seschrock /* 179*2082Seschrock * Multiply a given number by 2 raised to the given power. 180*2082Seschrock */ 181*2082Seschrock static uint8_t 182*2082Seschrock vdev_raidz_exp2(uint_t a, int exp) 183*2082Seschrock { 184*2082Seschrock if (a == 0) 185*2082Seschrock return (0); 186*2082Seschrock 187*2082Seschrock ASSERT(exp >= 0); 188*2082Seschrock ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 189*2082Seschrock 190*2082Seschrock exp += vdev_raidz_log2[a]; 191*2082Seschrock if (exp > 255) 192*2082Seschrock exp -= 255; 193*2082Seschrock 194*2082Seschrock return (vdev_raidz_pow2[exp]); 195*2082Seschrock } 196*2082Seschrock 197789Sahrens static raidz_map_t * 198*2082Seschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 199*2082Seschrock uint64_t nparity) 200789Sahrens { 201789Sahrens raidz_map_t *rm; 202789Sahrens uint64_t b = zio->io_offset >> unit_shift; 203789Sahrens uint64_t s = zio->io_size >> unit_shift; 204789Sahrens uint64_t f = b % dcols; 205789Sahrens uint64_t o = (b / dcols) << unit_shift; 206*2082Seschrock uint64_t q, r, c, bc, col, acols, coff, devidx; 207789Sahrens 208*2082Seschrock q = s / (dcols - nparity); 209*2082Seschrock r = s - q * (dcols - nparity); 210*2082Seschrock bc = (r == 0 ? 0 : r + nparity); 211789Sahrens 212789Sahrens acols = (q == 0 ? 
bc : dcols); 213789Sahrens 214789Sahrens rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); 215789Sahrens 216789Sahrens rm->rm_cols = acols; 217789Sahrens rm->rm_bigcols = bc; 218789Sahrens rm->rm_asize = 0; 219*2082Seschrock rm->rm_missingdata = 0; 220*2082Seschrock rm->rm_missingparity = 0; 221*2082Seschrock rm->rm_firstdatacol = nparity; 222789Sahrens 223789Sahrens for (c = 0; c < acols; c++) { 224789Sahrens col = f + c; 225789Sahrens coff = o; 226789Sahrens if (col >= dcols) { 227789Sahrens col -= dcols; 228789Sahrens coff += 1ULL << unit_shift; 229789Sahrens } 230*2082Seschrock rm->rm_col[c].rc_devidx = col; 231789Sahrens rm->rm_col[c].rc_offset = coff; 232789Sahrens rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; 233789Sahrens rm->rm_col[c].rc_data = NULL; 234789Sahrens rm->rm_col[c].rc_error = 0; 235789Sahrens rm->rm_col[c].rc_tried = 0; 236789Sahrens rm->rm_col[c].rc_skipped = 0; 237789Sahrens rm->rm_asize += rm->rm_col[c].rc_size; 238789Sahrens } 239789Sahrens 240*2082Seschrock rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); 241789Sahrens 242789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++) 243789Sahrens rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 244789Sahrens 245789Sahrens rm->rm_col[c].rc_data = zio->io_data; 246789Sahrens 247789Sahrens for (c = c + 1; c < acols; c++) 248789Sahrens rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 249789Sahrens rm->rm_col[c - 1].rc_size; 250789Sahrens 2511133Seschrock /* 252*2082Seschrock * If all data stored spans all columns, there's a danger that parity 253*2082Seschrock * will always be on the same device and, since parity isn't read 254*2082Seschrock * during normal operation, that that device's I/O bandwidth won't be 255*2082Seschrock * used effectively. We therefore switch the parity every 1MB. 256*2082Seschrock * 257*2082Seschrock * ... at least that was, ostensibly, the theory. 
As a practical 258*2082Seschrock * matter unless we juggle the parity between all devices evenly, we 259*2082Seschrock * won't see any benefit. Further, occasional writes that aren't a 260*2082Seschrock * multiple of the LCM of the number of children and the minimum 261*2082Seschrock * stripe width are sufficient to avoid pessimal behavior. 262*2082Seschrock * Unfortunately, this decision created an implicit on-disk format 263*2082Seschrock * requirement that we need to support for all eternity (but only for 264*2082Seschrock * RAID-Z with one parity device). 2651133Seschrock */ 2661133Seschrock ASSERT(rm->rm_cols >= 2); 2671133Seschrock ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 268789Sahrens 269*2082Seschrock if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 270*2082Seschrock devidx = rm->rm_col[0].rc_devidx; 2711133Seschrock o = rm->rm_col[0].rc_offset; 272*2082Seschrock rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 2731133Seschrock rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 274*2082Seschrock rm->rm_col[1].rc_devidx = devidx; 2751133Seschrock rm->rm_col[1].rc_offset = o; 276789Sahrens } 277789Sahrens 278789Sahrens zio->io_vsd = rm; 279789Sahrens return (rm); 280789Sahrens } 281789Sahrens 282789Sahrens static void 283789Sahrens vdev_raidz_map_free(zio_t *zio) 284789Sahrens { 285789Sahrens raidz_map_t *rm = zio->io_vsd; 286789Sahrens int c; 287789Sahrens 288789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++) 289789Sahrens zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 290789Sahrens 291789Sahrens kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); 292789Sahrens zio->io_vsd = NULL; 293789Sahrens } 294789Sahrens 295789Sahrens static void 296*2082Seschrock vdev_raidz_generate_parity_p(raidz_map_t *rm) 297*2082Seschrock { 298*2082Seschrock uint64_t *p, *src, pcount, ccount, i; 299*2082Seschrock int c; 300*2082Seschrock 301*2082Seschrock pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 
302*2082Seschrock 303*2082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 304*2082Seschrock src = rm->rm_col[c].rc_data; 305*2082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 306*2082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 307*2082Seschrock 308*2082Seschrock if (c == rm->rm_firstdatacol) { 309*2082Seschrock ASSERT(ccount == pcount); 310*2082Seschrock for (i = 0; i < ccount; i++, p++, src++) { 311*2082Seschrock *p = *src; 312*2082Seschrock } 313*2082Seschrock } else { 314*2082Seschrock ASSERT(ccount <= pcount); 315*2082Seschrock for (i = 0; i < ccount; i++, p++, src++) { 316*2082Seschrock *p ^= *src; 317*2082Seschrock } 318*2082Seschrock } 319*2082Seschrock } 320*2082Seschrock } 321*2082Seschrock 322*2082Seschrock static void 323*2082Seschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm) 324789Sahrens { 325*2082Seschrock uint64_t *q, *p, *src, pcount, ccount, mask, i; 326*2082Seschrock int c; 327*2082Seschrock 328*2082Seschrock pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 329*2082Seschrock ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 330*2082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 331*2082Seschrock 332*2082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 333*2082Seschrock src = rm->rm_col[c].rc_data; 334*2082Seschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 335*2082Seschrock q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 336*2082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 337*2082Seschrock 338*2082Seschrock if (c == rm->rm_firstdatacol) { 339*2082Seschrock ASSERT(ccount == pcount || ccount == 0); 340*2082Seschrock for (i = 0; i < ccount; i++, p++, q++, src++) { 341*2082Seschrock *q = *src; 342*2082Seschrock *p = *src; 343*2082Seschrock } 344*2082Seschrock for (; i < pcount; i++, p++, q++, src++) { 345*2082Seschrock *q = 0; 346*2082Seschrock *p = 0; 347*2082Seschrock } 348*2082Seschrock } else { 349*2082Seschrock ASSERT(ccount <= pcount); 350789Sahrens 351*2082Seschrock /* 
352*2082Seschrock * Rather than multiplying each byte individually (as 353*2082Seschrock * described above), we are able to handle 8 at once 354*2082Seschrock * by generating a mask based on the high bit in each 355*2082Seschrock * byte and using that to conditionally XOR in 0x1d. 356*2082Seschrock */ 357*2082Seschrock for (i = 0; i < ccount; i++, p++, q++, src++) { 358*2082Seschrock mask = *q & 0x8080808080808080ULL; 359*2082Seschrock mask = (mask << 1) - (mask >> 7); 360*2082Seschrock *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 361*2082Seschrock (mask & 0x1d1d1d1d1d1d1d1dULL); 362*2082Seschrock *q ^= *src; 363*2082Seschrock *p ^= *src; 364*2082Seschrock } 365*2082Seschrock 366*2082Seschrock /* 367*2082Seschrock * Treat short columns as though they are full of 0s. 368*2082Seschrock */ 369*2082Seschrock for (; i < pcount; i++, q++) { 370*2082Seschrock mask = *q & 0x8080808080808080ULL; 371*2082Seschrock mask = (mask << 1) - (mask >> 7); 372*2082Seschrock *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 373*2082Seschrock (mask & 0x1d1d1d1d1d1d1d1dULL); 374*2082Seschrock } 375*2082Seschrock } 376*2082Seschrock } 377*2082Seschrock } 378*2082Seschrock 379*2082Seschrock static void 380*2082Seschrock vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) 381*2082Seschrock { 382*2082Seschrock uint64_t *dst, *src, xcount, ccount, count, i; 383*2082Seschrock int c; 384*2082Seschrock 385*2082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 386*2082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 387*2082Seschrock ASSERT(xcount > 0); 388*2082Seschrock 389*2082Seschrock src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 390*2082Seschrock dst = rm->rm_col[x].rc_data; 391*2082Seschrock for (i = 0; i < xcount; i++, dst++, src++) { 392*2082Seschrock *dst = *src; 393*2082Seschrock } 394*2082Seschrock 395*2082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 396789Sahrens src = rm->rm_col[c].rc_data; 397789Sahrens dst = rm->rm_col[x].rc_data; 
398*2082Seschrock 399*2082Seschrock if (c == x) 400*2082Seschrock continue; 401*2082Seschrock 402*2082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 403*2082Seschrock count = MIN(ccount, xcount); 404*2082Seschrock 405*2082Seschrock for (i = 0; i < count; i++, dst++, src++) { 406*2082Seschrock *dst ^= *src; 407789Sahrens } 408789Sahrens } 409789Sahrens } 410789Sahrens 411*2082Seschrock static void 412*2082Seschrock vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) 413*2082Seschrock { 414*2082Seschrock uint64_t *dst, *src, xcount, ccount, count, mask, i; 415*2082Seschrock uint8_t *b; 416*2082Seschrock int c, j, exp; 417*2082Seschrock 418*2082Seschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 419*2082Seschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 420*2082Seschrock 421*2082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 422*2082Seschrock src = rm->rm_col[c].rc_data; 423*2082Seschrock dst = rm->rm_col[x].rc_data; 424*2082Seschrock 425*2082Seschrock if (c == x) 426*2082Seschrock ccount = 0; 427*2082Seschrock else 428*2082Seschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 429*2082Seschrock 430*2082Seschrock count = MIN(ccount, xcount); 431*2082Seschrock 432*2082Seschrock if (c == rm->rm_firstdatacol) { 433*2082Seschrock for (i = 0; i < count; i++, dst++, src++) { 434*2082Seschrock *dst = *src; 435*2082Seschrock } 436*2082Seschrock for (; i < xcount; i++, dst++) { 437*2082Seschrock *dst = 0; 438*2082Seschrock } 439*2082Seschrock 440*2082Seschrock } else { 441*2082Seschrock /* 442*2082Seschrock * For an explanation of this, see the comment in 443*2082Seschrock * vdev_raidz_generate_parity_pq() above. 
444*2082Seschrock */ 445*2082Seschrock for (i = 0; i < count; i++, dst++, src++) { 446*2082Seschrock mask = *dst & 0x8080808080808080ULL; 447*2082Seschrock mask = (mask << 1) - (mask >> 7); 448*2082Seschrock *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 449*2082Seschrock (mask & 0x1d1d1d1d1d1d1d1dULL); 450*2082Seschrock *dst ^= *src; 451*2082Seschrock } 452*2082Seschrock 453*2082Seschrock for (; i < xcount; i++, dst++) { 454*2082Seschrock mask = *dst & 0x8080808080808080ULL; 455*2082Seschrock mask = (mask << 1) - (mask >> 7); 456*2082Seschrock *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 457*2082Seschrock (mask & 0x1d1d1d1d1d1d1d1dULL); 458*2082Seschrock } 459*2082Seschrock } 460*2082Seschrock } 461*2082Seschrock 462*2082Seschrock src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 463*2082Seschrock dst = rm->rm_col[x].rc_data; 464*2082Seschrock exp = 255 - (rm->rm_cols - 1 - x); 465*2082Seschrock 466*2082Seschrock for (i = 0; i < xcount; i++, dst++, src++) { 467*2082Seschrock *dst ^= *src; 468*2082Seschrock for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 469*2082Seschrock *b = vdev_raidz_exp2(*b, exp); 470*2082Seschrock } 471*2082Seschrock } 472*2082Seschrock } 473*2082Seschrock 474*2082Seschrock static void 475*2082Seschrock vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) 476*2082Seschrock { 477*2082Seschrock uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 478*2082Seschrock void *pdata, *qdata; 479*2082Seschrock uint64_t xsize, ysize, i; 480*2082Seschrock 481*2082Seschrock ASSERT(x < y); 482*2082Seschrock ASSERT(x >= rm->rm_firstdatacol); 483*2082Seschrock ASSERT(y < rm->rm_cols); 484*2082Seschrock 485*2082Seschrock ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 486*2082Seschrock 487*2082Seschrock /* 488*2082Seschrock * Move the parity data aside -- we're going to compute parity as 489*2082Seschrock * though columns x and y were full of zeros -- Pxy and Qxy. 
We want to 490*2082Seschrock * reuse the parity generation mechanism without trashing the actual 491*2082Seschrock * parity so we make those columns appear to be full of zeros by 492*2082Seschrock * setting their lengths to zero. 493*2082Seschrock */ 494*2082Seschrock pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 495*2082Seschrock qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 496*2082Seschrock xsize = rm->rm_col[x].rc_size; 497*2082Seschrock ysize = rm->rm_col[y].rc_size; 498*2082Seschrock 499*2082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = 500*2082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 501*2082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = 502*2082Seschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 503*2082Seschrock rm->rm_col[x].rc_size = 0; 504*2082Seschrock rm->rm_col[y].rc_size = 0; 505*2082Seschrock 506*2082Seschrock vdev_raidz_generate_parity_pq(rm); 507*2082Seschrock 508*2082Seschrock rm->rm_col[x].rc_size = xsize; 509*2082Seschrock rm->rm_col[y].rc_size = ysize; 510*2082Seschrock 511*2082Seschrock p = pdata; 512*2082Seschrock q = qdata; 513*2082Seschrock pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 514*2082Seschrock qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 515*2082Seschrock xd = rm->rm_col[x].rc_data; 516*2082Seschrock yd = rm->rm_col[y].rc_data; 517*2082Seschrock 518*2082Seschrock /* 519*2082Seschrock * We now have: 520*2082Seschrock * Pxy = P + D_x + D_y 521*2082Seschrock * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 522*2082Seschrock * 523*2082Seschrock * We can then solve for D_x: 524*2082Seschrock * D_x = A * (P + Pxy) + B * (Q + Qxy) 525*2082Seschrock * where 526*2082Seschrock * A = 2^(x - y) * (2^(x - y) + 1)^-1 527*2082Seschrock * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 528*2082Seschrock * 529*2082Seschrock * With D_x in hand, we can easily solve for D_y: 530*2082Seschrock * D_y = P + Pxy + D_x 531*2082Seschrock */ 532*2082Seschrock 533*2082Seschrock a = vdev_raidz_pow2[255 + x - y]; 534*2082Seschrock b = 
vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 535*2082Seschrock tmp = 255 - vdev_raidz_log2[a ^ 1]; 536*2082Seschrock 537*2082Seschrock aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 538*2082Seschrock bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 539*2082Seschrock 540*2082Seschrock for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 541*2082Seschrock *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 542*2082Seschrock vdev_raidz_exp2(*q ^ *qxy, bexp); 543*2082Seschrock 544*2082Seschrock if (i < ysize) 545*2082Seschrock *yd = *p ^ *pxy ^ *xd; 546*2082Seschrock } 547*2082Seschrock 548*2082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 549*2082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_size); 550*2082Seschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 551*2082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 552*2082Seschrock 553*2082Seschrock /* 554*2082Seschrock * Restore the saved parity data. 555*2082Seschrock */ 556*2082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 557*2082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 558*2082Seschrock } 559*2082Seschrock 560*2082Seschrock 561789Sahrens static int 562789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 563789Sahrens { 564789Sahrens vdev_t *cvd; 565*2082Seschrock uint64_t nparity = vd->vdev_nparity; 566789Sahrens int c, error; 567789Sahrens int lasterror = 0; 568789Sahrens int numerrors = 0; 569789Sahrens 570*2082Seschrock ASSERT(nparity > 0); 571*2082Seschrock 572*2082Seschrock if (nparity > VDEV_RAIDZ_MAXPARITY || 573*2082Seschrock vd->vdev_children < nparity + 1) { 574789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 575789Sahrens return (EINVAL); 576789Sahrens } 577789Sahrens 578789Sahrens for (c = 0; c < vd->vdev_children; c++) { 579789Sahrens cvd = vd->vdev_child[c]; 580789Sahrens 581789Sahrens if ((error = vdev_open(cvd)) != 0) { 582789Sahrens lasterror = error; 583789Sahrens numerrors++; 584789Sahrens continue; 585789Sahrens } 586789Sahrens 587789Sahrens 
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 5881732Sbonwick *ashift = MAX(*ashift, cvd->vdev_ashift); 589789Sahrens } 590789Sahrens 591789Sahrens *asize *= vd->vdev_children; 592789Sahrens 593*2082Seschrock if (numerrors > nparity) { 594789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 595789Sahrens return (lasterror); 596789Sahrens } 597789Sahrens 598789Sahrens return (0); 599789Sahrens } 600789Sahrens 601789Sahrens static void 602789Sahrens vdev_raidz_close(vdev_t *vd) 603789Sahrens { 604789Sahrens int c; 605789Sahrens 606789Sahrens for (c = 0; c < vd->vdev_children; c++) 607789Sahrens vdev_close(vd->vdev_child[c]); 608789Sahrens } 609789Sahrens 610789Sahrens static uint64_t 611789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize) 612789Sahrens { 613789Sahrens uint64_t asize; 6141732Sbonwick uint64_t ashift = vd->vdev_top->vdev_ashift; 615789Sahrens uint64_t cols = vd->vdev_children; 616*2082Seschrock uint64_t nparity = vd->vdev_nparity; 617789Sahrens 6181732Sbonwick asize = ((psize - 1) >> ashift) + 1; 619*2082Seschrock asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 620*2082Seschrock asize = roundup(asize, nparity + 1) << ashift; 621789Sahrens 622789Sahrens return (asize); 623789Sahrens } 624789Sahrens 625789Sahrens static void 626789Sahrens vdev_raidz_child_done(zio_t *zio) 627789Sahrens { 628789Sahrens raidz_col_t *rc = zio->io_private; 629789Sahrens 630789Sahrens rc->rc_error = zio->io_error; 631789Sahrens rc->rc_tried = 1; 632789Sahrens rc->rc_skipped = 0; 633789Sahrens } 634789Sahrens 635789Sahrens static void 636789Sahrens vdev_raidz_repair_done(zio_t *zio) 637789Sahrens { 6381732Sbonwick ASSERT(zio->io_private == zio->io_parent); 6391732Sbonwick vdev_raidz_map_free(zio->io_private); 640789Sahrens } 641789Sahrens 642789Sahrens static void 643789Sahrens vdev_raidz_io_start(zio_t *zio) 644789Sahrens { 645789Sahrens vdev_t *vd = zio->io_vd; 6461732Sbonwick vdev_t *tvd = vd->vdev_top; 647789Sahrens vdev_t *cvd; 
648789Sahrens blkptr_t *bp = zio->io_bp; 649789Sahrens raidz_map_t *rm; 650789Sahrens raidz_col_t *rc; 651789Sahrens int c; 652789Sahrens 653*2082Seschrock rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 654*2082Seschrock vd->vdev_nparity); 655789Sahrens 6561775Sbillm ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 657789Sahrens 658789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 659789Sahrens /* 660*2082Seschrock * Generate RAID parity in the first virtual columns. 661789Sahrens */ 662*2082Seschrock if (rm->rm_firstdatacol == 1) 663*2082Seschrock vdev_raidz_generate_parity_p(rm); 664*2082Seschrock else 665*2082Seschrock vdev_raidz_generate_parity_pq(rm); 666789Sahrens 667789Sahrens for (c = 0; c < rm->rm_cols; c++) { 668789Sahrens rc = &rm->rm_col[c]; 669*2082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 670789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 671789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 672789Sahrens zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 673789Sahrens vdev_raidz_child_done, rc)); 674789Sahrens } 675789Sahrens zio_wait_children_done(zio); 676789Sahrens return; 677789Sahrens } 678789Sahrens 679789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 680789Sahrens 681*2082Seschrock /* 682*2082Seschrock * Iterate over the columns in reverse order so that we hit the parity 683*2082Seschrock * last -- any errors along the way will force us to read the parity 684*2082Seschrock * data. 
685*2082Seschrock */ 686789Sahrens for (c = rm->rm_cols - 1; c >= 0; c--) { 687789Sahrens rc = &rm->rm_col[c]; 688*2082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 689789Sahrens if (vdev_is_dead(cvd)) { 690*2082Seschrock if (c >= rm->rm_firstdatacol) 691*2082Seschrock rm->rm_missingdata++; 692*2082Seschrock else 693*2082Seschrock rm->rm_missingparity++; 694789Sahrens rc->rc_error = ENXIO; 695789Sahrens rc->rc_tried = 1; /* don't even try */ 696789Sahrens rc->rc_skipped = 1; 697789Sahrens continue; 698789Sahrens } 699789Sahrens if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { 700*2082Seschrock if (c >= rm->rm_firstdatacol) 701*2082Seschrock rm->rm_missingdata++; 702*2082Seschrock else 703*2082Seschrock rm->rm_missingparity++; 704789Sahrens rc->rc_error = ESTALE; 705789Sahrens rc->rc_skipped = 1; 706789Sahrens continue; 707789Sahrens } 708*2082Seschrock if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 709789Sahrens (zio->io_flags & ZIO_FLAG_SCRUB)) { 710789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 711789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 712789Sahrens zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 713789Sahrens vdev_raidz_child_done, rc)); 714789Sahrens } 715789Sahrens } 716789Sahrens 717789Sahrens zio_wait_children_done(zio); 718789Sahrens } 719789Sahrens 7201544Seschrock /* 7211544Seschrock * Report a checksum error for a child of a RAID-Z device. 
7221544Seschrock */ 7231544Seschrock static void 7241544Seschrock raidz_checksum_error(zio_t *zio, raidz_col_t *rc) 7251544Seschrock { 726*2082Seschrock vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 7271544Seschrock dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", 7281544Seschrock vdev_description(vd)); 7291544Seschrock 7301544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 7311544Seschrock mutex_enter(&vd->vdev_stat_lock); 7321544Seschrock vd->vdev_stat.vs_checksum_errors++; 7331544Seschrock mutex_exit(&vd->vdev_stat_lock); 7341544Seschrock } 7351544Seschrock 7361544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 7371544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 7381544Seschrock zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); 7391544Seschrock } 7401544Seschrock 741*2082Seschrock /* 742*2082Seschrock * Generate the parity from the data columns. If we tried and were able to 743*2082Seschrock * read the parity without error, verify that the generated parity matches the 744*2082Seschrock * data we read. If it doesn't, we fire off a checksum error. Return the 745*2082Seschrock * number such failures. 
746*2082Seschrock */ 747*2082Seschrock static int 748*2082Seschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 749*2082Seschrock { 750*2082Seschrock void *orig[VDEV_RAIDZ_MAXPARITY]; 751*2082Seschrock int c, ret = 0; 752*2082Seschrock raidz_col_t *rc; 753*2082Seschrock 754*2082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 755*2082Seschrock rc = &rm->rm_col[c]; 756*2082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 757*2082Seschrock continue; 758*2082Seschrock orig[c] = zio_buf_alloc(rc->rc_size); 759*2082Seschrock bcopy(rc->rc_data, orig[c], rc->rc_size); 760*2082Seschrock } 761*2082Seschrock 762*2082Seschrock if (rm->rm_firstdatacol == 1) 763*2082Seschrock vdev_raidz_generate_parity_p(rm); 764*2082Seschrock else 765*2082Seschrock vdev_raidz_generate_parity_pq(rm); 766*2082Seschrock 767*2082Seschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 768*2082Seschrock rc = &rm->rm_col[c]; 769*2082Seschrock if (!rc->rc_tried || rc->rc_error != 0) 770*2082Seschrock continue; 771*2082Seschrock if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 772*2082Seschrock raidz_checksum_error(zio, rc); 773*2082Seschrock rc->rc_error = ECKSUM; 774*2082Seschrock ret++; 775*2082Seschrock } 776*2082Seschrock zio_buf_free(orig[c], rc->rc_size); 777*2082Seschrock } 778*2082Seschrock 779*2082Seschrock return (ret); 780*2082Seschrock } 781*2082Seschrock 782*2082Seschrock static uint64_t raidz_corrected_p; 783*2082Seschrock static uint64_t raidz_corrected_q; 784*2082Seschrock static uint64_t raidz_corrected_pq; 7851544Seschrock 786789Sahrens static void 787789Sahrens vdev_raidz_io_done(zio_t *zio) 788789Sahrens { 789789Sahrens vdev_t *vd = zio->io_vd; 790789Sahrens vdev_t *cvd; 791789Sahrens raidz_map_t *rm = zio->io_vsd; 792*2082Seschrock raidz_col_t *rc, *rc1; 793789Sahrens int unexpected_errors = 0; 794*2082Seschrock int parity_errors = 0; 795*2082Seschrock int data_errors = 0; 796*2082Seschrock int n, c, c1; 797789Sahrens 7981775Sbillm ASSERT(zio->io_bp != NULL); /* XXX 
need to add code to enforce this */ 799789Sahrens 800789Sahrens zio->io_error = 0; 801789Sahrens zio->io_numerrors = 0; 802789Sahrens 803*2082Seschrock ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 804*2082Seschrock ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 805*2082Seschrock 806789Sahrens for (c = 0; c < rm->rm_cols; c++) { 807789Sahrens rc = &rm->rm_col[c]; 808789Sahrens 809789Sahrens /* 810789Sahrens * We preserve any EIOs because those may be worth retrying; 811789Sahrens * whereas ECKSUM and ENXIO are more likely to be persistent. 812789Sahrens */ 813789Sahrens if (rc->rc_error) { 814789Sahrens if (zio->io_error != EIO) 815789Sahrens zio->io_error = rc->rc_error; 816*2082Seschrock 817*2082Seschrock if (c < rm->rm_firstdatacol) 818*2082Seschrock parity_errors++; 819*2082Seschrock else 820*2082Seschrock data_errors++; 821*2082Seschrock 822789Sahrens if (!rc->rc_skipped) 823789Sahrens unexpected_errors++; 824*2082Seschrock 825789Sahrens zio->io_numerrors++; 826789Sahrens } 827789Sahrens } 828789Sahrens 829789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 830789Sahrens /* 831789Sahrens * If this is not a failfast write, and we were able to 832789Sahrens * write enough columns to reconstruct the data, good enough. 833789Sahrens */ 834789Sahrens /* XXPOLICY */ 835789Sahrens if (zio->io_numerrors <= rm->rm_firstdatacol && 836789Sahrens !(zio->io_flags & ZIO_FLAG_FAILFAST)) 837789Sahrens zio->io_error = 0; 838789Sahrens 839789Sahrens vdev_raidz_map_free(zio); 840789Sahrens zio_next_stage(zio); 841789Sahrens return; 842789Sahrens } 843789Sahrens 844789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 845*2082Seschrock /* 846*2082Seschrock * There are three potential phases for a read: 847*2082Seschrock * 1. produce valid data from the columns read 848*2082Seschrock * 2. read all disks and try again 849*2082Seschrock * 3. 
perform combinatorial reconstruction 850*2082Seschrock * 851*2082Seschrock * Each phase is progressively both more expensive and less likely to 852*2082Seschrock * occur. If we encounter more errors than we can repair or all phases 853*2082Seschrock * fail, we have no choice but to return an error. 854*2082Seschrock */ 855789Sahrens 856789Sahrens /* 857*2082Seschrock * If the number of errors we saw was correctable -- less than or equal 858*2082Seschrock * to the number of parity disks -- attempt to produce data that has a 859*2082Seschrock * valid checksum. Naturally, zero errors falls into this case. 860789Sahrens */ 861*2082Seschrock if (zio->io_numerrors <= rm->rm_firstdatacol) { 862*2082Seschrock switch (data_errors) { 863*2082Seschrock case 0: 864*2082Seschrock if (zio_checksum_error(zio) == 0) { 865*2082Seschrock zio->io_error = 0; 866*2082Seschrock n = raidz_parity_verify(zio, rm); 867*2082Seschrock unexpected_errors += n; 868*2082Seschrock ASSERT(parity_errors + n <= 869*2082Seschrock rm->rm_firstdatacol); 870*2082Seschrock goto done; 871*2082Seschrock } 872*2082Seschrock break; 873*2082Seschrock 874*2082Seschrock case 1: 875*2082Seschrock ASSERT(parity_errors < rm->rm_firstdatacol); 876*2082Seschrock 877*2082Seschrock /* 878*2082Seschrock * Find the column that reported the error. 
879*2082Seschrock */ 880*2082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 881*2082Seschrock rc = &rm->rm_col[c]; 882*2082Seschrock if (rc->rc_error != 0) 883*2082Seschrock break; 884*2082Seschrock } 885*2082Seschrock ASSERT(c != rm->rm_cols); 886*2082Seschrock ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 887*2082Seschrock rc->rc_error == ESTALE); 888*2082Seschrock 889*2082Seschrock if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { 890*2082Seschrock vdev_raidz_reconstruct_p(rm, c); 891*2082Seschrock } else { 892*2082Seschrock ASSERT(rm->rm_firstdatacol > 1); 893*2082Seschrock vdev_raidz_reconstruct_q(rm, c); 894*2082Seschrock } 895*2082Seschrock 896*2082Seschrock if (zio_checksum_error(zio) == 0) { 897*2082Seschrock zio->io_error = 0; 898*2082Seschrock if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) 899*2082Seschrock atomic_inc_64(&raidz_corrected_p); 900*2082Seschrock else 901*2082Seschrock atomic_inc_64(&raidz_corrected_q); 902789Sahrens 903*2082Seschrock /* 904*2082Seschrock * If there's more than one parity disk, 905*2082Seschrock * confirm that the parity disk not used above 906*2082Seschrock * has the correct data. 907*2082Seschrock */ 908*2082Seschrock if (rm->rm_firstdatacol > 1) { 909*2082Seschrock n = raidz_parity_verify(zio, rm); 910*2082Seschrock unexpected_errors += n; 911*2082Seschrock ASSERT(parity_errors + n <= 912*2082Seschrock rm->rm_firstdatacol); 913*2082Seschrock } 914*2082Seschrock 915*2082Seschrock goto done; 916*2082Seschrock } 917*2082Seschrock break; 918*2082Seschrock 919*2082Seschrock case 2: 920*2082Seschrock /* 921*2082Seschrock * Find the two columns that reported errors. 
922*2082Seschrock */ 923*2082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 924*2082Seschrock rc = &rm->rm_col[c]; 925*2082Seschrock if (rc->rc_error != 0) 926*2082Seschrock break; 927789Sahrens } 928*2082Seschrock ASSERT(c != rm->rm_cols); 929*2082Seschrock ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 930*2082Seschrock rc->rc_error == ESTALE); 931*2082Seschrock 932*2082Seschrock for (c1 = c++; c < rm->rm_cols; c++) { 933*2082Seschrock rc = &rm->rm_col[c]; 934*2082Seschrock if (rc->rc_error != 0) 935*2082Seschrock break; 936*2082Seschrock } 937*2082Seschrock ASSERT(c != rm->rm_cols); 938*2082Seschrock ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 939*2082Seschrock rc->rc_error == ESTALE); 940789Sahrens 941*2082Seschrock vdev_raidz_reconstruct_pq(rm, c1, c); 942*2082Seschrock 943*2082Seschrock if (zio_checksum_error(zio) == 0) { 944*2082Seschrock zio->io_error = 0; 945*2082Seschrock atomic_inc_64(&raidz_corrected_pq); 946*2082Seschrock 947*2082Seschrock goto done; 948*2082Seschrock } 949*2082Seschrock break; 950*2082Seschrock 951*2082Seschrock default: 952*2082Seschrock ASSERT(rm->rm_firstdatacol <= 2); 953*2082Seschrock ASSERT(0); 954789Sahrens } 955789Sahrens } 956789Sahrens 957789Sahrens /* 958*2082Seschrock * This isn't a typical situation -- either we got a read error or 959*2082Seschrock * a child silently returned bad data. Read every block so we can 960*2082Seschrock * try again with as much data and parity as we can track down. If 961*2082Seschrock * we've already been through once before, all children will be marked 962*2082Seschrock * as tried so we'll proceed to combinatorial reconstruction. 
963789Sahrens */ 964789Sahrens unexpected_errors = 1; 965*2082Seschrock rm->rm_missingdata = 0; 966*2082Seschrock rm->rm_missingparity = 0; 967789Sahrens 968*2082Seschrock for (c = 0; c < rm->rm_cols; c++) { 969*2082Seschrock if (rm->rm_col[c].rc_tried) 970*2082Seschrock continue; 971789Sahrens 972789Sahrens zio->io_error = 0; 973789Sahrens zio_vdev_io_redone(zio); 974*2082Seschrock do { 975789Sahrens rc = &rm->rm_col[c]; 976789Sahrens if (rc->rc_tried) 977789Sahrens continue; 978789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, 979*2082Seschrock vd->vdev_child[rc->rc_devidx], 980789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 981789Sahrens zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 982789Sahrens vdev_raidz_child_done, rc)); 983*2082Seschrock } while (++c < rm->rm_cols); 984*2082Seschrock dprintf("rereading\n"); 985789Sahrens zio_wait_children_done(zio); 986789Sahrens return; 987789Sahrens } 988789Sahrens 989789Sahrens /* 990*2082Seschrock * At this point we've attempted to reconstruct the data given the 991*2082Seschrock * errors we detected, and we've attempted to read all columns. There 992*2082Seschrock * must, therefore, be one or more additional problems -- silent errors 993*2082Seschrock * resulting in invalid data rather than explicit I/O errors resulting 994*2082Seschrock * in absent data. Before we attempt combinatorial reconstruction make 995*2082Seschrock * sure we have a chance of coming up with the right answer. 996789Sahrens */ 997*2082Seschrock if (zio->io_numerrors >= rm->rm_firstdatacol) { 998789Sahrens ASSERT(zio->io_error != 0); 999789Sahrens goto done; 1000789Sahrens } 1001789Sahrens 1002*2082Seschrock if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { 1003*2082Seschrock /* 1004*2082Seschrock * Attempt to reconstruct the data from parity P. 
1005*2082Seschrock */ 1006*2082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1007*2082Seschrock void *orig; 1008*2082Seschrock rc = &rm->rm_col[c]; 1009*2082Seschrock 1010*2082Seschrock orig = zio_buf_alloc(rc->rc_size); 1011*2082Seschrock bcopy(rc->rc_data, orig, rc->rc_size); 1012*2082Seschrock vdev_raidz_reconstruct_p(rm, c); 1013*2082Seschrock 1014*2082Seschrock if (zio_checksum_error(zio) == 0) { 1015*2082Seschrock zio_buf_free(orig, rc->rc_size); 1016*2082Seschrock zio->io_error = 0; 1017*2082Seschrock atomic_inc_64(&raidz_corrected_p); 1018*2082Seschrock 1019*2082Seschrock /* 1020*2082Seschrock * If this child didn't know that it returned 1021*2082Seschrock * bad data, inform it. 1022*2082Seschrock */ 1023*2082Seschrock if (rc->rc_tried && rc->rc_error == 0) 1024*2082Seschrock raidz_checksum_error(zio, rc); 1025*2082Seschrock rc->rc_error = ECKSUM; 1026*2082Seschrock goto done; 1027*2082Seschrock } 1028*2082Seschrock 1029*2082Seschrock bcopy(orig, rc->rc_data, rc->rc_size); 1030*2082Seschrock zio_buf_free(orig, rc->rc_size); 1031*2082Seschrock } 1032*2082Seschrock } 1033*2082Seschrock 1034*2082Seschrock if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { 1035*2082Seschrock /* 1036*2082Seschrock * Attempt to reconstruct the data from parity Q. 
1037*2082Seschrock */ 1038*2082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1039*2082Seschrock void *orig; 1040*2082Seschrock rc = &rm->rm_col[c]; 1041*2082Seschrock 1042*2082Seschrock orig = zio_buf_alloc(rc->rc_size); 1043*2082Seschrock bcopy(rc->rc_data, orig, rc->rc_size); 1044*2082Seschrock vdev_raidz_reconstruct_q(rm, c); 1045*2082Seschrock 1046*2082Seschrock if (zio_checksum_error(zio) == 0) { 1047*2082Seschrock zio_buf_free(orig, rc->rc_size); 1048789Sahrens zio->io_error = 0; 1049*2082Seschrock atomic_inc_64(&raidz_corrected_q); 1050*2082Seschrock 1051*2082Seschrock /* 1052*2082Seschrock * If this child didn't know that it returned 1053*2082Seschrock * bad data, inform it. 1054*2082Seschrock */ 1055*2082Seschrock if (rc->rc_tried && rc->rc_error == 0) 1056*2082Seschrock raidz_checksum_error(zio, rc); 1057*2082Seschrock rc->rc_error = ECKSUM; 1058*2082Seschrock goto done; 1059*2082Seschrock } 1060*2082Seschrock 1061*2082Seschrock bcopy(orig, rc->rc_data, rc->rc_size); 1062*2082Seschrock zio_buf_free(orig, rc->rc_size); 1063*2082Seschrock } 1064*2082Seschrock } 1065*2082Seschrock 1066*2082Seschrock if (rm->rm_firstdatacol > 1 && 1067*2082Seschrock rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && 1068*2082Seschrock rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { 1069*2082Seschrock /* 1070*2082Seschrock * Attempt to reconstruct the data from both P and Q. 
1071*2082Seschrock */ 1072*2082Seschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { 1073*2082Seschrock void *orig, *orig1; 1074*2082Seschrock rc = &rm->rm_col[c]; 1075*2082Seschrock 1076*2082Seschrock orig = zio_buf_alloc(rc->rc_size); 1077*2082Seschrock bcopy(rc->rc_data, orig, rc->rc_size); 1078*2082Seschrock 1079*2082Seschrock for (c1 = c + 1; c1 < rm->rm_cols; c1++) { 1080*2082Seschrock rc1 = &rm->rm_col[c1]; 1081*2082Seschrock 1082*2082Seschrock orig1 = zio_buf_alloc(rc1->rc_size); 1083*2082Seschrock bcopy(rc1->rc_data, orig1, rc1->rc_size); 1084*2082Seschrock 1085*2082Seschrock vdev_raidz_reconstruct_pq(rm, c, c1); 1086*2082Seschrock 1087*2082Seschrock if (zio_checksum_error(zio) == 0) { 1088*2082Seschrock zio_buf_free(orig, rc->rc_size); 1089*2082Seschrock zio_buf_free(orig1, rc1->rc_size); 1090*2082Seschrock zio->io_error = 0; 1091*2082Seschrock atomic_inc_64(&raidz_corrected_pq); 1092*2082Seschrock 1093*2082Seschrock /* 1094*2082Seschrock * If these children didn't know they 1095*2082Seschrock * returned bad data, inform them. 1096*2082Seschrock */ 1097*2082Seschrock if (rc->rc_tried && rc->rc_error == 0) 1098*2082Seschrock raidz_checksum_error(zio, rc); 1099*2082Seschrock if (rc1->rc_tried && rc1->rc_error == 0) 1100*2082Seschrock raidz_checksum_error(zio, rc1); 1101*2082Seschrock 1102*2082Seschrock rc->rc_error = ECKSUM; 1103*2082Seschrock rc1->rc_error = ECKSUM; 1104*2082Seschrock 1105*2082Seschrock goto done; 1106*2082Seschrock } 1107*2082Seschrock 1108*2082Seschrock bcopy(orig1, rc1->rc_data, rc1->rc_size); 1109*2082Seschrock zio_buf_free(orig1, rc1->rc_size); 1110*2082Seschrock } 1111*2082Seschrock 1112*2082Seschrock bcopy(orig, rc->rc_data, rc->rc_size); 1113*2082Seschrock zio_buf_free(orig, rc->rc_size); 1114789Sahrens } 1115789Sahrens } 1116789Sahrens 1117789Sahrens /* 1118*2082Seschrock * All combinations failed to checksum. Generate checksum ereports for 1119*2082Seschrock * all children. 
1120789Sahrens */ 1121789Sahrens zio->io_error = ECKSUM; 11221544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 11231544Seschrock for (c = 0; c < rm->rm_cols; c++) { 11241544Seschrock rc = &rm->rm_col[c]; 11251544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 1126*2082Seschrock zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, 11271544Seschrock rc->rc_offset, rc->rc_size); 11281544Seschrock } 11291544Seschrock } 1130789Sahrens 1131789Sahrens done: 1132789Sahrens zio_checksum_verified(zio); 1133789Sahrens 1134789Sahrens if (zio->io_error == 0 && (spa_mode & FWRITE) && 1135789Sahrens (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 11361732Sbonwick zio_t *rio; 11371732Sbonwick 1138789Sahrens /* 1139789Sahrens * Use the good data we have in hand to repair damaged children. 11401732Sbonwick * 11411732Sbonwick * We issue all repair I/Os as children of 'rio' to arrange 11421732Sbonwick * that vdev_raidz_map_free(zio) will be invoked after all 11431732Sbonwick * repairs complete, but before we advance to the next stage. 
1144789Sahrens */ 11451732Sbonwick rio = zio_null(zio, zio->io_spa, 11461732Sbonwick vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL); 11471732Sbonwick 1148789Sahrens for (c = 0; c < rm->rm_cols; c++) { 1149789Sahrens rc = &rm->rm_col[c]; 1150*2082Seschrock cvd = vd->vdev_child[rc->rc_devidx]; 1151789Sahrens 11521732Sbonwick if (rc->rc_error == 0) 11531732Sbonwick continue; 11541732Sbonwick 11551732Sbonwick dprintf("%s resilvered %s @ 0x%llx error %d\n", 11561732Sbonwick vdev_description(vd), 11571732Sbonwick vdev_description(cvd), 11581732Sbonwick zio->io_offset, rc->rc_error); 1159789Sahrens 11601732Sbonwick zio_nowait(zio_vdev_child_io(rio, NULL, cvd, 11611732Sbonwick rc->rc_offset, rc->rc_data, rc->rc_size, 11621732Sbonwick ZIO_TYPE_WRITE, zio->io_priority, 1163*2082Seschrock ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE | 1164*2082Seschrock ZIO_FLAG_CANFAIL, NULL, NULL)); 11651732Sbonwick } 1166789Sahrens 11671732Sbonwick zio_nowait(rio); 11681732Sbonwick zio_wait_children_done(zio); 11691732Sbonwick return; 1170789Sahrens } 1171789Sahrens 1172789Sahrens vdev_raidz_map_free(zio); 1173789Sahrens zio_next_stage(zio); 1174789Sahrens } 1175789Sahrens 1176789Sahrens static void 1177789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 1178789Sahrens { 1179*2082Seschrock if (faulted > vd->vdev_nparity) 11801544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 11811544Seschrock VDEV_AUX_NO_REPLICAS); 1182789Sahrens else if (degraded + faulted != 0) 11831544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 1184789Sahrens else 11851544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 1186789Sahrens } 1187789Sahrens 1188789Sahrens vdev_ops_t vdev_raidz_ops = { 1189789Sahrens vdev_raidz_open, 1190789Sahrens vdev_raidz_close, 1191789Sahrens vdev_raidz_asize, 1192789Sahrens vdev_raidz_io_start, 1193789Sahrens vdev_raidz_io_done, 1194789Sahrens vdev_raidz_state_change, 1195789Sahrens 
VDEV_TYPE_RAIDZ, /* name of this vdev type */ 1196789Sahrens B_FALSE /* not a leaf vdev */ 1197789Sahrens }; 1198