1*789Sahrens /* 2*789Sahrens * CDDL HEADER START 3*789Sahrens * 4*789Sahrens * The contents of this file are subject to the terms of the 5*789Sahrens * Common Development and Distribution License, Version 1.0 only 6*789Sahrens * (the "License"). You may not use this file except in compliance 7*789Sahrens * with the License. 8*789Sahrens * 9*789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10*789Sahrens * or http://www.opensolaris.org/os/licensing. 11*789Sahrens * See the License for the specific language governing permissions 12*789Sahrens * and limitations under the License. 13*789Sahrens * 14*789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 15*789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16*789Sahrens * If applicable, add the following below this CDDL HEADER, with the 17*789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 18*789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 19*789Sahrens * 20*789Sahrens * CDDL HEADER END 21*789Sahrens */ 22*789Sahrens /* 23*789Sahrens * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24*789Sahrens * Use is subject to license terms. 25*789Sahrens */ 26*789Sahrens 27*789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28*789Sahrens 29*789Sahrens #include <sys/zfs_context.h> 30*789Sahrens #include <sys/spa.h> 31*789Sahrens #include <sys/vdev_impl.h> 32*789Sahrens #include <sys/zio.h> 33*789Sahrens #include <sys/zio_checksum.h> 34*789Sahrens #include <sys/fs/zfs.h> 35*789Sahrens 36*789Sahrens /* 37*789Sahrens * Virtual device vector for RAID-Z. 38*789Sahrens */ 39*789Sahrens 40*789Sahrens /* 41*789Sahrens * We currently allow up to two-way replication (i.e. single-fault 42*789Sahrens * reconstruction) models in RAID-Z vdevs. The blocks in such vdevs 43*789Sahrens * must all be multiples of two times the leaf vdev blocksize. 44*789Sahrens */ 45*789Sahrens #define VDEV_RAIDZ_ALIGN 2ULL 46*789Sahrens 47*789Sahrens typedef struct raidz_col { 48*789Sahrens uint64_t rc_col; 49*789Sahrens uint64_t rc_offset; 50*789Sahrens uint64_t rc_size; 51*789Sahrens void *rc_data; 52*789Sahrens int rc_error; 53*789Sahrens short rc_tried; 54*789Sahrens short rc_skipped; 55*789Sahrens } raidz_col_t; 56*789Sahrens 57*789Sahrens typedef struct raidz_map { 58*789Sahrens uint64_t rm_cols; 59*789Sahrens uint64_t rm_bigcols; 60*789Sahrens uint64_t rm_asize; 61*789Sahrens int rm_missing_child; 62*789Sahrens int rm_type; 63*789Sahrens int rm_firstdatacol; 64*789Sahrens raidz_col_t rm_col[1]; 65*789Sahrens } raidz_map_t; 66*789Sahrens 67*789Sahrens #define RAIDZ_SINGLE 0 68*789Sahrens #define RAIDZ_PARITY 1 69*789Sahrens 70*789Sahrens static raidz_map_t * 71*789Sahrens vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 72*789Sahrens int raid_type) 73*789Sahrens { 74*789Sahrens raidz_map_t *rm; 75*789Sahrens uint64_t b = zio->io_offset >> unit_shift; 76*789Sahrens uint64_t s = zio->io_size >> unit_shift; 77*789Sahrens uint64_t f = b % dcols; 78*789Sahrens uint64_t o = (b / dcols) << unit_shift; 79*789Sahrens uint64_t q, r, c, bc, col, acols, coff; 80*789Sahrens int firstdatacol; 81*789Sahrens 82*789Sahrens switch (raid_type) { 83*789Sahrens case RAIDZ_SINGLE: 84*789Sahrens q = s / dcols; 85*789Sahrens r = s - q * dcols; 86*789Sahrens bc = r; 87*789Sahrens firstdatacol = 0; 88*789Sahrens break; 89*789Sahrens case RAIDZ_PARITY: 90*789Sahrens q = s / (dcols - 1); 91*789Sahrens r = s - q * (dcols - 1); 92*789Sahrens bc = r + !!r; 93*789Sahrens firstdatacol = 1; 94*789Sahrens break; 95*789Sahrens } 96*789Sahrens 97*789Sahrens acols = (q == 0 ? bc : dcols); 98*789Sahrens 99*789Sahrens rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); 100*789Sahrens 101*789Sahrens rm->rm_cols = acols; 102*789Sahrens rm->rm_bigcols = bc; 103*789Sahrens rm->rm_asize = 0; 104*789Sahrens rm->rm_missing_child = -1; 105*789Sahrens rm->rm_type = raid_type; 106*789Sahrens rm->rm_firstdatacol = firstdatacol; 107*789Sahrens 108*789Sahrens for (c = 0; c < acols; c++) { 109*789Sahrens col = f + c; 110*789Sahrens coff = o; 111*789Sahrens if (col >= dcols) { 112*789Sahrens col -= dcols; 113*789Sahrens coff += 1ULL << unit_shift; 114*789Sahrens } 115*789Sahrens rm->rm_col[c].rc_col = col; 116*789Sahrens rm->rm_col[c].rc_offset = coff; 117*789Sahrens rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; 118*789Sahrens rm->rm_col[c].rc_data = NULL; 119*789Sahrens rm->rm_col[c].rc_error = 0; 120*789Sahrens rm->rm_col[c].rc_tried = 0; 121*789Sahrens rm->rm_col[c].rc_skipped = 0; 122*789Sahrens rm->rm_asize += rm->rm_col[c].rc_size; 123*789Sahrens } 124*789Sahrens 125*789Sahrens rm->rm_asize = P2ROUNDUP(rm->rm_asize, VDEV_RAIDZ_ALIGN << unit_shift); 126*789Sahrens 127*789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++) 128*789Sahrens rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 129*789Sahrens 130*789Sahrens rm->rm_col[c].rc_data = zio->io_data; 131*789Sahrens 132*789Sahrens for (c = c + 1; c < acols; c++) 133*789Sahrens rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 134*789Sahrens rm->rm_col[c - 1].rc_size; 135*789Sahrens 136*789Sahrens if (raid_type == RAIDZ_PARITY) { 137*789Sahrens /* 138*789Sahrens * To prevent hot parity disks, switch the parity and data 139*789Sahrens * columns every 1MB. 140*789Sahrens */ 141*789Sahrens ASSERT(rm->rm_cols >= 2); 142*789Sahrens ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 143*789Sahrens 144*789Sahrens if (zio->io_offset & (1ULL << 20)) { 145*789Sahrens col = rm->rm_col[0].rc_col; 146*789Sahrens o = rm->rm_col[0].rc_offset; 147*789Sahrens rm->rm_col[0].rc_col = rm->rm_col[1].rc_col; 148*789Sahrens rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 149*789Sahrens rm->rm_col[1].rc_col = col; 150*789Sahrens rm->rm_col[1].rc_offset = o; 151*789Sahrens } 152*789Sahrens } 153*789Sahrens 154*789Sahrens zio->io_vsd = rm; 155*789Sahrens return (rm); 156*789Sahrens } 157*789Sahrens 158*789Sahrens static void 159*789Sahrens vdev_raidz_map_free(zio_t *zio) 160*789Sahrens { 161*789Sahrens raidz_map_t *rm = zio->io_vsd; 162*789Sahrens int c; 163*789Sahrens 164*789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++) 165*789Sahrens zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 166*789Sahrens 167*789Sahrens kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); 168*789Sahrens zio->io_vsd = NULL; 169*789Sahrens } 170*789Sahrens 171*789Sahrens static void 172*789Sahrens vdev_raidz_reconstruct(raidz_map_t *rm, int x) 173*789Sahrens { 174*789Sahrens uint64_t *dst, *src, count, xsize, csize; 175*789Sahrens int i, c; 176*789Sahrens 177*789Sahrens for (c = 0; c < rm->rm_cols; c++) { 178*789Sahrens if (c == x) 179*789Sahrens continue; 180*789Sahrens src = rm->rm_col[c].rc_data; 181*789Sahrens dst = rm->rm_col[x].rc_data; 182*789Sahrens csize = rm->rm_col[c].rc_size; 183*789Sahrens xsize = rm->rm_col[x].rc_size; 184*789Sahrens count = MIN(csize, xsize) / sizeof (uint64_t); 185*789Sahrens if (c == !x) { 186*789Sahrens /* 187*789Sahrens * The initial copy happens at either c == 0 or c == 1. 188*789Sahrens * Both of these columns are 'big' columns, so we'll 189*789Sahrens * definitely initialize all of column x. 190*789Sahrens */ 191*789Sahrens ASSERT3U(xsize, <=, csize); 192*789Sahrens for (i = 0; i < count; i++) 193*789Sahrens *dst++ = *src++; 194*789Sahrens } else { 195*789Sahrens for (i = 0; i < count; i++) 196*789Sahrens *dst++ ^= *src++; 197*789Sahrens } 198*789Sahrens } 199*789Sahrens } 200*789Sahrens 201*789Sahrens static int 202*789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 203*789Sahrens { 204*789Sahrens vdev_t *cvd; 205*789Sahrens int c, error; 206*789Sahrens int lasterror = 0; 207*789Sahrens int numerrors = 0; 208*789Sahrens 209*789Sahrens /* 210*789Sahrens * XXX -- minimum children should be raid-type-specific 211*789Sahrens */ 212*789Sahrens if (vd->vdev_children < 2) { 213*789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 214*789Sahrens return (EINVAL); 215*789Sahrens } 216*789Sahrens 217*789Sahrens for (c = 0; c < vd->vdev_children; c++) { 218*789Sahrens cvd = vd->vdev_child[c]; 219*789Sahrens 220*789Sahrens if ((error = vdev_open(cvd)) != 0) { 221*789Sahrens lasterror = error; 222*789Sahrens numerrors++; 223*789Sahrens continue; 224*789Sahrens } 225*789Sahrens 226*789Sahrens *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 227*789Sahrens *ashift = cvd->vdev_ashift; 228*789Sahrens } 229*789Sahrens 230*789Sahrens *asize *= vd->vdev_children; 231*789Sahrens 232*789Sahrens if (numerrors > 1) { 233*789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 234*789Sahrens return (lasterror); 235*789Sahrens } 236*789Sahrens 237*789Sahrens return (0); 238*789Sahrens } 239*789Sahrens 240*789Sahrens static void 241*789Sahrens vdev_raidz_close(vdev_t *vd) 242*789Sahrens { 243*789Sahrens int c; 244*789Sahrens 245*789Sahrens for (c = 0; c < vd->vdev_children; c++) 246*789Sahrens vdev_close(vd->vdev_child[c]); 247*789Sahrens } 248*789Sahrens 249*789Sahrens static uint64_t 250*789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize) 251*789Sahrens { 252*789Sahrens uint64_t asize; 253*789Sahrens uint64_t cols = vd->vdev_children; 254*789Sahrens 255*789Sahrens /* 256*789Sahrens * These calculations assume RAIDZ_PARITY. 257*789Sahrens */ 258*789Sahrens asize = psize >> vd->vdev_ashift; 259*789Sahrens asize += (asize + cols - 2) / (cols - 1); 260*789Sahrens asize = P2ROUNDUP(asize, VDEV_RAIDZ_ALIGN) << vd->vdev_ashift; 261*789Sahrens 262*789Sahrens return (asize); 263*789Sahrens } 264*789Sahrens 265*789Sahrens static void 266*789Sahrens vdev_raidz_child_done(zio_t *zio) 267*789Sahrens { 268*789Sahrens raidz_col_t *rc = zio->io_private; 269*789Sahrens 270*789Sahrens rc->rc_error = zio->io_error; 271*789Sahrens rc->rc_tried = 1; 272*789Sahrens rc->rc_skipped = 0; 273*789Sahrens } 274*789Sahrens 275*789Sahrens static void 276*789Sahrens vdev_raidz_repair_done(zio_t *zio) 277*789Sahrens { 278*789Sahrens zio_buf_free(zio->io_data, zio->io_size); 279*789Sahrens } 280*789Sahrens 281*789Sahrens static void 282*789Sahrens vdev_raidz_io_start(zio_t *zio) 283*789Sahrens { 284*789Sahrens vdev_t *vd = zio->io_vd; 285*789Sahrens vdev_t *cvd; 286*789Sahrens blkptr_t *bp = zio->io_bp; 287*789Sahrens raidz_map_t *rm; 288*789Sahrens raidz_col_t *rc; 289*789Sahrens int c; 290*789Sahrens 291*789Sahrens rm = vdev_raidz_map_alloc(zio, vd->vdev_ashift, vd->vdev_children, 292*789Sahrens RAIDZ_PARITY); 293*789Sahrens 294*789Sahrens if (DVA_GET_GANG(ZIO_GET_DVA(zio))) { 295*789Sahrens ASSERT3U(rm->rm_asize, ==, 296*789Sahrens vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE)); 297*789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 298*789Sahrens } else { 299*789Sahrens ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio))); 300*789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 301*789Sahrens } 302*789Sahrens 303*789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 304*789Sahrens 305*789Sahrens /* 306*789Sahrens * Generate RAID parity in virtual column 0. 307*789Sahrens */ 308*789Sahrens vdev_raidz_reconstruct(rm, 0); 309*789Sahrens 310*789Sahrens for (c = 0; c < rm->rm_cols; c++) { 311*789Sahrens rc = &rm->rm_col[c]; 312*789Sahrens cvd = vd->vdev_child[rc->rc_col]; 313*789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 314*789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 315*789Sahrens zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 316*789Sahrens vdev_raidz_child_done, rc)); 317*789Sahrens } 318*789Sahrens zio_wait_children_done(zio); 319*789Sahrens return; 320*789Sahrens } 321*789Sahrens 322*789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 323*789Sahrens 324*789Sahrens for (c = rm->rm_cols - 1; c >= 0; c--) { 325*789Sahrens rc = &rm->rm_col[c]; 326*789Sahrens cvd = vd->vdev_child[rc->rc_col]; 327*789Sahrens if (vdev_is_dead(cvd)) { 328*789Sahrens rm->rm_missing_child = c; 329*789Sahrens rc->rc_error = ENXIO; 330*789Sahrens rc->rc_tried = 1; /* don't even try */ 331*789Sahrens rc->rc_skipped = 1; 332*789Sahrens continue; 333*789Sahrens } 334*789Sahrens if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { 335*789Sahrens rm->rm_missing_child = c; 336*789Sahrens rc->rc_error = ESTALE; 337*789Sahrens rc->rc_skipped = 1; 338*789Sahrens continue; 339*789Sahrens } 340*789Sahrens if (c >= rm->rm_firstdatacol || rm->rm_missing_child != -1 || 341*789Sahrens (zio->io_flags & ZIO_FLAG_SCRUB)) { 342*789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 343*789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 344*789Sahrens zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 345*789Sahrens vdev_raidz_child_done, rc)); 346*789Sahrens } 347*789Sahrens } 348*789Sahrens 349*789Sahrens zio_wait_children_done(zio); 350*789Sahrens } 351*789Sahrens 352*789Sahrens static void 353*789Sahrens vdev_raidz_io_done(zio_t *zio) 354*789Sahrens { 355*789Sahrens vdev_t *vd = zio->io_vd; 356*789Sahrens vdev_t *cvd; 357*789Sahrens raidz_map_t *rm = zio->io_vsd; 358*789Sahrens raidz_col_t *rc; 359*789Sahrens blkptr_t *bp = zio->io_bp; 360*789Sahrens int unexpected_errors = 0; 361*789Sahrens int c; 362*789Sahrens 363*789Sahrens ASSERT(bp != NULL); /* XXX need to add code to enforce this */ 364*789Sahrens 365*789Sahrens zio->io_error = 0; 366*789Sahrens zio->io_numerrors = 0; 367*789Sahrens 368*789Sahrens for (c = 0; c < rm->rm_cols; c++) { 369*789Sahrens rc = &rm->rm_col[c]; 370*789Sahrens 371*789Sahrens /* 372*789Sahrens * We preserve any EIOs because those may be worth retrying; 373*789Sahrens * whereas ECKSUM and ENXIO are more likely to be persistent. 374*789Sahrens */ 375*789Sahrens if (rc->rc_error) { 376*789Sahrens if (zio->io_error != EIO) 377*789Sahrens zio->io_error = rc->rc_error; 378*789Sahrens if (!rc->rc_skipped) 379*789Sahrens unexpected_errors++; 380*789Sahrens zio->io_numerrors++; 381*789Sahrens } 382*789Sahrens } 383*789Sahrens 384*789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 385*789Sahrens /* 386*789Sahrens * If this is not a failfast write, and we were able to 387*789Sahrens * write enough columns to reconstruct the data, good enough. 388*789Sahrens */ 389*789Sahrens /* XXPOLICY */ 390*789Sahrens if (zio->io_numerrors <= rm->rm_firstdatacol && 391*789Sahrens !(zio->io_flags & ZIO_FLAG_FAILFAST)) 392*789Sahrens zio->io_error = 0; 393*789Sahrens 394*789Sahrens vdev_raidz_map_free(zio); 395*789Sahrens zio_next_stage(zio); 396*789Sahrens return; 397*789Sahrens } 398*789Sahrens 399*789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 400*789Sahrens 401*789Sahrens /* 402*789Sahrens * If there were no I/O errors, and the data checksums correctly, 403*789Sahrens * the read is complete. 404*789Sahrens */ 405*789Sahrens /* XXPOLICY */ 406*789Sahrens if (zio->io_numerrors == 0 && zio_checksum_error(zio) == 0) { 407*789Sahrens ASSERT(unexpected_errors == 0); 408*789Sahrens ASSERT(zio->io_error == 0); 409*789Sahrens 410*789Sahrens /* 411*789Sahrens * We know the data's good. If we read the parity, 412*789Sahrens * verify that it's good as well. If not, fix it. 413*789Sahrens */ 414*789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++) { 415*789Sahrens void *orig; 416*789Sahrens rc = &rm->rm_col[c]; 417*789Sahrens if (!rc->rc_tried) 418*789Sahrens continue; 419*789Sahrens orig = zio_buf_alloc(rc->rc_size); 420*789Sahrens bcopy(rc->rc_data, orig, rc->rc_size); 421*789Sahrens vdev_raidz_reconstruct(rm, c); 422*789Sahrens if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) { 423*789Sahrens vdev_checksum_error(zio, 424*789Sahrens vd->vdev_child[rc->rc_col]); 425*789Sahrens rc->rc_error = ECKSUM; 426*789Sahrens unexpected_errors++; 427*789Sahrens } 428*789Sahrens zio_buf_free(orig, rc->rc_size); 429*789Sahrens } 430*789Sahrens goto done; 431*789Sahrens } 432*789Sahrens 433*789Sahrens /* 434*789Sahrens * If there was exactly one I/O error, it's the one we expected, 435*789Sahrens * and the reconstructed data checksums, the read is complete. 436*789Sahrens * This happens when one child is offline and vdev_fault_assess() 437*789Sahrens * knows it, or when one child has stale data and the DTL knows it. 438*789Sahrens */ 439*789Sahrens if (zio->io_numerrors == 1 && (c = rm->rm_missing_child) != -1) { 440*789Sahrens rc = &rm->rm_col[c]; 441*789Sahrens ASSERT(unexpected_errors == 0); 442*789Sahrens ASSERT(rc->rc_error == ENXIO || rc->rc_error == ESTALE); 443*789Sahrens vdev_raidz_reconstruct(rm, c); 444*789Sahrens if (zio_checksum_error(zio) == 0) { 445*789Sahrens zio->io_error = 0; 446*789Sahrens goto done; 447*789Sahrens } 448*789Sahrens } 449*789Sahrens 450*789Sahrens /* 451*789Sahrens * This isn't a typical error -- either we got a read error or 452*789Sahrens * more than one child claimed a problem. Read every block we 453*789Sahrens * haven't already so we can try combinatorial reconstruction. 454*789Sahrens */ 455*789Sahrens unexpected_errors = 1; 456*789Sahrens rm->rm_missing_child = -1; 457*789Sahrens 458*789Sahrens for (c = 0; c < rm->rm_cols; c++) 459*789Sahrens if (!rm->rm_col[c].rc_tried) 460*789Sahrens break; 461*789Sahrens 462*789Sahrens if (c != rm->rm_cols) { 463*789Sahrens zio->io_error = 0; 464*789Sahrens zio_vdev_io_redone(zio); 465*789Sahrens for (c = 0; c < rm->rm_cols; c++) { 466*789Sahrens rc = &rm->rm_col[c]; 467*789Sahrens if (rc->rc_tried) 468*789Sahrens continue; 469*789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, 470*789Sahrens vd->vdev_child[rc->rc_col], 471*789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 472*789Sahrens zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 473*789Sahrens vdev_raidz_child_done, rc)); 474*789Sahrens } 475*789Sahrens zio_wait_children_done(zio); 476*789Sahrens return; 477*789Sahrens } 478*789Sahrens 479*789Sahrens /* 480*789Sahrens * If there were more errors than parity disks, give up. 481*789Sahrens */ 482*789Sahrens if (zio->io_numerrors > rm->rm_firstdatacol) { 483*789Sahrens ASSERT(zio->io_error != 0); 484*789Sahrens goto done; 485*789Sahrens } 486*789Sahrens 487*789Sahrens /* 488*789Sahrens * The number of I/O errors is correctable. Correct them here. 489*789Sahrens */ 490*789Sahrens ASSERT(zio->io_numerrors <= rm->rm_firstdatacol); 491*789Sahrens for (c = 0; c < rm->rm_cols; c++) { 492*789Sahrens rc = &rm->rm_col[c]; 493*789Sahrens ASSERT(rc->rc_tried); 494*789Sahrens if (rc->rc_error) { 495*789Sahrens vdev_raidz_reconstruct(rm, c); 496*789Sahrens if (zio_checksum_error(zio) == 0) 497*789Sahrens zio->io_error = 0; 498*789Sahrens else 499*789Sahrens zio->io_error = rc->rc_error; 500*789Sahrens goto done; 501*789Sahrens } 502*789Sahrens } 503*789Sahrens 504*789Sahrens /* 505*789Sahrens * There were no I/O errors, but the data doesn't checksum. 506*789Sahrens * Try all permutations to see if we can find one that does. 507*789Sahrens */ 508*789Sahrens ASSERT(zio->io_numerrors == 0); 509*789Sahrens for (c = 0; c < rm->rm_cols; c++) { 510*789Sahrens void *orig; 511*789Sahrens rc = &rm->rm_col[c]; 512*789Sahrens 513*789Sahrens orig = zio_buf_alloc(rc->rc_size); 514*789Sahrens bcopy(rc->rc_data, orig, rc->rc_size); 515*789Sahrens vdev_raidz_reconstruct(rm, c); 516*789Sahrens 517*789Sahrens if (zio_checksum_error(zio) == 0) { 518*789Sahrens zio_buf_free(orig, rc->rc_size); 519*789Sahrens zio->io_error = 0; 520*789Sahrens /* 521*789Sahrens * If this child didn't know that it returned bad data, 522*789Sahrens * inform it. 523*789Sahrens */ 524*789Sahrens if (rc->rc_tried && rc->rc_error == 0) 525*789Sahrens vdev_checksum_error(zio, 526*789Sahrens vd->vdev_child[rc->rc_col]); 527*789Sahrens rc->rc_error = ECKSUM; 528*789Sahrens goto done; 529*789Sahrens } 530*789Sahrens 531*789Sahrens bcopy(orig, rc->rc_data, rc->rc_size); 532*789Sahrens zio_buf_free(orig, rc->rc_size); 533*789Sahrens } 534*789Sahrens 535*789Sahrens /* 536*789Sahrens * All combinations failed to checksum. 537*789Sahrens */ 538*789Sahrens zio->io_error = ECKSUM; 539*789Sahrens 540*789Sahrens done: 541*789Sahrens zio_checksum_verified(zio); 542*789Sahrens 543*789Sahrens if (zio->io_error == 0 && (spa_mode & FWRITE) && 544*789Sahrens (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 545*789Sahrens /* 546*789Sahrens * Use the good data we have in hand to repair damaged children. 547*789Sahrens */ 548*789Sahrens for (c = 0; c < rm->rm_cols; c++) { 549*789Sahrens rc = &rm->rm_col[c]; 550*789Sahrens cvd = vd->vdev_child[rc->rc_col]; 551*789Sahrens 552*789Sahrens if (rc->rc_error) { 553*789Sahrens /* 554*789Sahrens * Make a copy of the data because we're 555*789Sahrens * going to free the RAID-Z map below. 556*789Sahrens */ 557*789Sahrens void *data = zio_buf_alloc(rc->rc_size); 558*789Sahrens bcopy(rc->rc_data, data, rc->rc_size); 559*789Sahrens 560*789Sahrens dprintf("%s resilvered %s @ 0x%llx error %d\n", 561*789Sahrens vdev_description(vd), 562*789Sahrens vdev_description(cvd), 563*789Sahrens zio->io_offset, rc->rc_error); 564*789Sahrens 565*789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 566*789Sahrens rc->rc_offset, data, rc->rc_size, 567*789Sahrens ZIO_TYPE_WRITE, zio->io_priority, 568*789Sahrens ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL | 569*789Sahrens ZIO_FLAG_DONT_PROPAGATE, 570*789Sahrens vdev_raidz_repair_done, NULL)); 571*789Sahrens } 572*789Sahrens } 573*789Sahrens } 574*789Sahrens 575*789Sahrens vdev_raidz_map_free(zio); 576*789Sahrens zio_next_stage(zio); 577*789Sahrens } 578*789Sahrens 579*789Sahrens static void 580*789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 581*789Sahrens { 582*789Sahrens if (faulted > 1) 583*789Sahrens vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); 584*789Sahrens else if (degraded + faulted != 0) 585*789Sahrens vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 586*789Sahrens else 587*789Sahrens vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 588*789Sahrens } 589*789Sahrens 590*789Sahrens vdev_ops_t vdev_raidz_ops = { 591*789Sahrens vdev_raidz_open, 592*789Sahrens vdev_raidz_close, 593*789Sahrens vdev_raidz_asize, 594*789Sahrens vdev_raidz_io_start, 595*789Sahrens vdev_raidz_io_done, 596*789Sahrens vdev_raidz_state_change, 597*789Sahrens VDEV_TYPE_RAIDZ, /* name of this vdev type */ 598*789Sahrens B_FALSE /* not a leaf vdev */ 599*789Sahrens }; 600